view detrprok_scripts/splitTranscriptGff.pl @ 0:a53eb951b164 draft

Uploaded
author clairetn
date Mon, 25 Mar 2013 05:39:29 -0400
parents
children
line wrap: on
line source

#!/usr/bin/perl -w
###
# But : croiser
# 
# Entrees : 2 fichiers gff à croiser
#
# Sortie : gff affiche a l'ecran
#
###------------------------------------------------------
use vars qw($USAGE);                      
use strict;                               

=head1 NAME

splitTranscriptGff.pl - compare 2 input gff files and define intervalls by couple of overlapping elements

=head1 SYNOPSIS

% intervallsExtractorGff.pl -i file1.gff -j file2.gff -s strand [-h] 

=head1 DESCRIPTION
This script will sort 2 gff files and compute distance between 2 successives lines.

    -i|--input1 fileName   gff input file name: included elements
    -j|--input2 fileName   gff input file name: extended elements
   [-s|--strand] [s|d]	   s for single strand (colinear) or d for double strands (antisense) [default d]
   [-h|--help]             help mode then die                              

=head1 USECASE
Compare 2 input gff files: an annotations file (included elements) and transcription units file (extended elements).
For each couple of overlapping elements, split the transcription unit in 5'utr, CDS, 3'utr or "operon" in case of successives genes included in the transcription unit.

=head1 KWON BUG
Fisrt and last elements on the genome sequence can be misclassified in the result file.

=head1 AUTHOR
Claire Toffano-Nioche - sep.11

=cut
#-----------------------
sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
		#print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
	for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element 
			#print "....$info[$i+2]\n";
		if ($info[$i+3] =~ /\+/) {
			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # Forward sequence
		} else {
			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # Backward sequence
		}
	}
		#print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
		#print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
}
#-----------------------
sub recupInfo {	my ($pInfo, @lines) = @_ ;
    my $i=0 ;
    while ($i<=$#lines){
	chomp($lines[$i]);
	my @line = split("\t",$lines[$i]);
	if (defined ($line[0])) {
		if (!($line[0] =~ m/^\s*$|^#/)) { # skip both null and comment lines
			push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=name, 3=begin, 4=end, 6=strand
		} 
	}
	$i=$i+1 ;
    }
    	#print "recupInfo::end=", $i, "\n" ;
}
#-----------------------
sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;    
	my $tagN=$seqN.$strand.$posB."..".$posE;
		#print "tagName::",join("_",@_)," and tagName:$tagN\n";
return $tagN;
}
#-----------------------
sub transitionAnalysis {
my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
    my $enCours = @$ptag[$pos] ; 
    my $precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ; 
    if ($enCours ne $precedant) {
    	#print "transi...:: $s, $pos, $precedant, $enCours\n";
    	#print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
    	SWITCH: for ($precedant.$enCours) {
               	/01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
                /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
                /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ; 
                		if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ; 
                			#if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
                		} elsif ($$pfinAval!=$$pdebAval) {
                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ; 
                			#if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
                		}
                		$$pdebAval = 0 ; $$pfinAval = 0 ;
                		last SWITCH ;
                	 };
                /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
                		my $type="utr5";
                		if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
                			$$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ; 
                			$type="operon" ;
                		}
                		if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
                			printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ; 
                			# if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
                		} elsif ($$pfinAmont!=$$pdebAmont) {
	                		printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ; 
                			#if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
                		}
                		$$pdebAmont = 0 ; $$pfinAmont = 0 ;
                		last SWITCH ;
                	 };
                /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; 
                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; 
                		} elsif ($$pfinIn!=$$pdebIn) {
                		    printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; 
                		}
                		$$pdebIn = 0 ; $$pfinIn = 0 ;
                		last SWITCH ;
                	 };
                /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ; 
                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ; 
                		} elsif ($$pfinIn!=$$pdebIn) {
                			printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n", 
                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ; 
                		}
                		#$$pdebIn = 0 ; $$pfinIn = 0 ;
                		last SWITCH ;
                	 };
          }
    }
 }
#-----------------------	
my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
# command line check
foreach my $num (0 .. $#ARGV) {
        SWITCH: for ($ARGV[$num]) {
        /--input1|-i/ && do { 
			$fileNameI=$ARGV[$num+1]; 
			open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ; 
			last };
	/--input2|-j/ && do { 
			$fileNameE=$ARGV[$num+1]; 
			open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ; 
			last };
        /--strand|-s/ && do { 
			if ($ARGV[$num+1] eq "s") { $strand=1}; 
			last };
        /--help|-h/ && do { exec("pod2text $0\n") ; die };
        }
}
# memory declarations:
my @infoI ; my @infoE ;
my $seqName ;
my @tagF ; my @tagB ; # for Forward and Backward sequences
# data retrieval:
my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
close fichierGffI ; close fichierGffE ;
	#print "gff files read ; number of lines : ",$#linesI+1," + ",$#linesE+1,"\n";
# positions management:
&recupInfo(\@infoI, @linesI) ; 
	#print "number of treated elements:",($#infoI+1)/4,"\n";
&recupInfo(\@infoE, @linesE) ;
	#print "number of treated elements:",($#infoE+1)/4,"\n";
# treatement: 
# transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
if (($#infoI) and ($#infoE)) { 
	$seqName=$infoI[0] ;
		#print "end : $infoE[$#infoE-1]\n";
	for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
		#print "seqName : $seqName\n" ;
	&feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
	&feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
		#print join("", @tagF), "\n";
		#print join("", @tagB), "\n";
	# transition management:
	my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) 
		= (0, 0, 0, 0, 0, 0) ;
	for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
		&transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
	}
	($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) 
		= ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
	for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
		&transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
	}
}
exit(0) ;