Mercurial > repos > yufei-luo > s_mart

#!/usr/bin/perl -w
###
# Main : defining utr and intergenic operonic intervalles from a transcripts file following a referencies file
#
# Input : 2 gff files to intersect, transcript queries vs referencies
#
# Output : resulting gff file printing to standard output
#
###------------------------------------------------------
use vars qw($USAGE);
use strict;

=head1 NAME

splitTranscriptGff.pl - compare 2 input gff files and define utr and intergenic operonic intervalles by couple of overlapping elements

=head1 SYNOPSIS

% intervallsExtractorGff.pl -i referencies.gff -j transcriptQueries.gff -s strand [-h]

=head1 DESCRIPTION
This script will intersect 2 gff files and compute distance between 2 successives lines. Take care both of sorting by positions the input files and of that referencies are included in transcriptQueries.

    -i|--input1 fileName   gff input file name: included elements
    -j|--input2 fileName   gff input file name: extended elements
   [-s|--strand] [s|d]	   s for single strand (colinear) or d for double strands (antisense) [default d]
   [-h|--help]             help mode then die

=head1 USECASE
Define many fragments for each extended element (transcript): UTR5, gene, UTR3, "inOperon" for intergenomic region between 2 genes
intervallsExtractorGff.pl -i CDSannotations.gff -j RNAseqTranscripts.gff  > UTRsGenesOperonsLists.gff;

=head1 KWON BUGS
No disjonction of overlapping elements of the included elements (-i file).
In usecase, overlapping genes are fused in one long gene.

=head1 AUTHOR
Claire Toffano-Nioche - sep.11

=cut
#-----------------------
sub feedPositionTab { my ($val, $pF, $pB, @info) = @_ ;
		#print "feedPositionTab::$#info, ", ($#info+1)/4," \n";
	for (my $i=0 ; $i <= $#info ; $i+=4) { # for each extended element
			#print "....$info[$i+2]\n";
		if ($info[$i+3] =~ /\+/) {
			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pF[$c]=$val } ; # sequence Forward
		} else {
			for (my $c = $info[$i+1] ; $c <= $info[$i+2] ; $c++) { @$pB[$c]=$val } ; # sequence Backward
		}
	}
		#print "feedPos...:: ", join(".", @$pF[0..100]), "\n";
		#print "feedPos...:: ", join(".", @$pB[0..100]), "\n";
}
#-----------------------
sub recupInfo {	my ($pInfo, @lines) = @_ ;
    for (my $i=0 ; $i <= ($#lines+1)*4-1 ; $i+=4) {
    	my @line = split("\t",$lines[$i/4]);
		push(@$pInfo, $line[0], $line[3], $line[4], $line[6]) ; # 0=nom, 3=debut, 4=fin, 6=sens
	}
	#print "recupInfo::fin=", ($#lines+1)*4, "\n" ;
}
#-----------------------
sub tagName { my ($seqN, $posB, $posE, $strand) = @_ ;
	my $tagN=$seqN.$strand.$posB."..".$posE;
		#print "tagName:",join("_",@_)," et tagName:$tagN\n";
return $tagN;
}
#-----------------------
sub transitionAnalysis {
my ($pos, $seq, $s, $pdebAmont, $pfinAmont, $pdebIn, $pfinIn, $pdebAval, $pfinAval, $ptag) = @_ ;
	my $enCours = 0 ; my $precedant = 0 ;
	$enCours = @$ptag[$pos] ;
	$precedant = ($s =~ /\+/?@$ptag[$pos-1]:@$ptag[$pos+1]) ;
    if ($enCours ne $precedant) {
    	#print "transi...:: $s, $pos, $precedant, $enCours\n";
    	#print "transition::$$pdebAmont, $$pfinAmont, $$pdebIn, $$pfinIn, $$pdebAval, $$pfinAval\n";
    	SWITCH: for ($precedant.$enCours) {
               	/01/ && do { $$pdebAmont = $pos ; last SWITCH ;};
                /02/ && do { $$pdebIn = $pos ; last SWITCH ;};
                /10/ && do { $$pfinAval = ($s =~/\+/?$pos-1:$pos+1) ;
                		if (($s =~ /\+/)and ($$pdebAval!=$$pfinAval)) {
                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pdebAval, $$pfinAval, $s, &tagName($seq, $$pdebAval, $$pfinAval, $s) ;
                			#if ($$pdebAval==$$pfinAval) { print "transition 10 +\n"};
                		} elsif ($$pfinAval!=$$pdebAval) {
                			printf "%s\tsplit\tutr3\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pfinAval, $$pdebAval, $s, &tagName($seq, $$pfinAval, $$pdebAval, $s) ;
                			#if ($$pfinAval==$$pdebAval){ print "transition 10 -\n"};
                		}
                		$$pdebAval = 0 ; $$pfinAval = 0 ;
                		last SWITCH ;
                	 };
                /12/ && do { $$pdebIn = $pos ; $$pfinAmont=($s =~/\+/?$pos-1:$pos+1) ;
                		my $type="utr5";
                		if ($$pdebAmont == 0) { # in case of interOperon : utr5-CDS-interOperon-CDS-utr3
                			$$pdebAmont=($s =~/\+/?$$pfinIn+1:$$pfinIn-1) ;
                			$type="inOperon" ;
                		}
                		if (($s =~ /\+/) and ($$pdebAmont!=$$pfinAmont)) {
                			printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $type, $$pdebAmont, $$pfinAmont, $s, &tagName($seq, $$pdebAmont, $$pfinAmont, $s) ;
                			# if ($$pdebAmont==$$pfinAmont) { print "transition 12 +\n"};
                		} elsif ($$pfinAmont!=$$pdebAmont) {
	                		printf "%s\tsplit\t%s\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $type, $$pfinAmont, $$pdebAmont, $s, &tagName($seq, $$pfinAmont, $$pdebAmont, $s) ;
                			#if ($$pfinAmont==$$pdebAmont) { print "transition 12 -\n"} ;
                		}
                		$$pdebAmont = 0 ; $$pfinAmont = 0 ;
                		last SWITCH ;
                	 };
                /20/ && do { $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
                		} elsif ($$pfinIn!=$$pdebIn) {
                		    printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
                		}
                		$$pdebIn = 0 ; $$pfinIn = 0 ;
                		last SWITCH ;
                	 };
                /21/ && do { $$pdebAval=$pos ; $$pfinIn=($s =~/\+/?$pos-1:$pos+1) ;
                        if (($s =~ /\+/) and ($$pdebIn!=$$pfinIn)) {
                        	printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pdebIn, $$pfinIn, $s, &tagName($seq, $$pdebIn, $$pfinIn, $s) ;
                		} elsif ($$pfinIn!=$$pdebIn) {
                			printf "%s\tsplit\tgene\t%s\t%s\t.\t%s\t.\tName=%s;\n",
                				$seq, $$pfinIn, $$pdebIn, $s, &tagName($seq, $$pfinIn, $$pdebIn, $s) ;
                		}
                		#$$pdebIn = 0 ; $$pfinIn = 0 ;
                		last SWITCH ;
                	 };
          }
    }
 }
#-----------------------
my ($fileNameI, $fileNameE, $strand) = ("", "", 0) ;
# command line check
foreach my $num (0 .. $#ARGV) {
        SWITCH: for ($ARGV[$num]) {
        /--input1|-i/ && do {
			$fileNameI=$ARGV[$num+1];
			open ( fichierGffI, "< $fileNameI" ) or die "Can't open gff file: \"$fileNameI\"\n" ;
			last };
	/--input2|-j/ && do {
			$fileNameE=$ARGV[$num+1];
			open ( fichierGffE, "< $fileNameE" ) or die "Can't open gff file: \"$fileNameE\"\n" ;
			last };
        /--strand|-s/ && do {
			if ($ARGV[$num+1] eq "s") { $strand=1};
			last };
        /--help|-h/ && do { exec("pod2text $0\n") ; die };
        }
}
# memory declarations:
my @infoI ; my @infoE ;
my $seqName ;
my @tagF ; my @tagB ; # Forward and Backward sequence
# data retrieval:
my @linesI = <fichierGffI> ; my @linesE = <fichierGffE> ;
close fichierGffI ; close fichierGffE ;
		#print "gff files read ; number of lines : $#lines1 + $#lines2\n";
		# positions management
&recupInfo(\@infoI, @linesI) ;
&recupInfo(\@infoE, @linesE) ;
# treatement:
# transform gff lines into chromosomal position tags : 0 for nothing, 1 resp. 2 for extended resp. included elements
if (($#infoI) and ($#infoE)) {
	$seqName=$infoI[0] ;
		#print "fin : $infoE[$#infoE-1]\n";
	for (my $i=0 ; $i <= $infoE[$#infoE-1] ; $i++) { $tagF[$i] = 0 ; $tagB[$i] = 0 ; } ; # "O" tag in all chr. positions
		#print "seqName : $seqName\n" ;
	&feedPositionTab(1, \@tagF, \@tagB, @infoE) ; # "1" tag for all extended elements
	&feedPositionTab(2, \@tagF, \@tagB, @infoI) ; # "2" tag for all included elements
		#print join("", @tagF), "\n";
		#print join("", @tagB), "\n";
	# transition management:
	my ($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream)
		= (0, 0, 0, 0, 0, 0) ;
	for (my $i=1 ; $i <= $#tagF-1 ; $i+=1) {
		&transitionAnalysis($i, $seqName, "+", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagF) ;
	}
	($beginUpstream, $endUpstream, $beginIncluded, $endIncluded, $beginDownstream, $endDownstream) = ($infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1], $infoE[$#infoE-1]) ;
	for (my $i=$#tagB-1 ; $i >= 1 ; $i-=1) {
		&transitionAnalysis($i, $seqName, "-", \$beginUpstream, \$endUpstream, \$beginIncluded, \$endIncluded, \$beginDownstream, \$endDownstream, \@tagB) ;
	}
}
exit(0) ;
author	m-zytnicki
date	Tue, 30 Apr 2013 14:33:21 -0400
parents	94ab73e8a190
children