view SMART/bacteriaRegulatoryRegion_Detection/interElementGff.pl @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
line wrap: on
line source

#!/usr/bin/perl -w 
###
# But : protocol permettant la detection d'RNA non codant potentiel
# 
# Entrees : fichier de mapping Smart gff3
#			fichier gff des gènes
#			fichier gff des clusters Cis regulateur potentiel
#
# Sortie : fichier gff des clusters ARN nc
#
###------------------------------------------------------
use vars qw($USAGE);
use strict; 
                            
=head1 NAME

interElementGff.pl - creation of a new Gff corresponding to the region of two successive Elements

=head1 SYNOPSIS

% interElementGff.pl -i inputFile.gff3 -o outputFile.gff3 [-s 50] [-a 20] [-n seqName] [-h] 

=head1 DESCRIPTION
This script will determine cluster ok ncRNA.

    -i|--input 	fileName   gff input file name
    -o|--output	fileName   gff output file name
    -n|--name seqName      sequence name
    -p|--print			   print parameters	used

    -f5ff n   number of nt to exclude from 5' seed when gene before is Forward, seed is Forward and next gene is Forward [default 0]
    -ff3f n   number... " ...[default 0]

    -f5fr n   number... " ...[default 0] 
    -ff3r n   number... " ...[default 0]
     
    -fr3f n   number... " ...[default 0]        
    -fr5f n   number... " ...[default 0]
    
    -f3rr n   number... " ...[default 0]
    -fr5r n   number... " ...[default 0]

    -r5ff n   number... " ...[default 0]
    -rf3f n   number... " ...[default 0]

    -r5fr n   number... " ...[default 0]        
    -rf3r n   number... " ...[default 0]

    -r3rf n   number... " ...[default 0]
    -rr5f n   number... " ...[default 0]

    -r3rr n   number... " ...[default 0]
    -rr5r n   number... " ...[default 0]

   [-h|--help]		          help mode then die                              


USAGE_CASE

% interElementGff.pl -i inputFile.gff3 -o outputFile.gff3 -ff 53 -rr 23 -n NC_011744

BUG

Caution : input file needs to be sorted on positions

Caution : for -f/r options add +3 bp to include stop codon if not in input file

=head1 AUTHOR - CTN - apr.11
(from RNA-Vibrio/protocol_NC_V2.pl - Claire KUCHLY)

=cut
#----------------------------------------------------------------------------
# check command line :
my ($IDfile, $OutputFileName, $f5ff, $ff3f, $f5fr, $ff3r, $f3rf, $fr5f, $f3rr,$fr5r, $r5ff, $rf3f, $r5fr, $rf3r, $r3rf, $rr5f, $r3rr, $rr5r, $seqName, $printParameters) = 
   (undef, undef , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, "", 0) ;
if ($#ARGV==0) {
	die (exec("pod2text $0\n"));
} else {
    foreach my $num (0 .. $#ARGV) {
	SWITCH: for ($ARGV[$num]) {
	/--input|-i/ && do { $IDfile=$ARGV[$num+1]; 
		open(F,"<$IDfile") or die "Error: Can't open \"$IDfile\", $!"; 
		last; };
	/-f5ff/ && do { $f5ff=$ARGV[$num+1]+1; last; }; # need +1 for intervall computations
	/-ff3f/ && do { $ff3f=$ARGV[$num+1]+1; last; }; 
	
	/-f5fr/ && do { $f5fr=$ARGV[$num+1]+1; last; }; 
	/-ff3r/ && do { $ff3r=$ARGV[$num+1]+1; last; }; 	
	
	/-f3rf/ && do { $f3rf=$ARGV[$num+1]+1; last; };
	/-fr5f/ && do { $fr5f=$ARGV[$num+1]+1; last; };
	
	/-f3rr/ && do { $f3rr=$ARGV[$num+1]+1; last; };
	/-fr5r/ && do { $fr5r=$ARGV[$num+1]+1; last; };	
	
	/-r5ff/ && do { $r5ff=$ARGV[$num+1]+1; last; }; 
	/-rf3f/ && do { $rf3f=$ARGV[$num+1]+1; last; }; 
		
	/-r5fr/ && do { $r5fr=$ARGV[$num+1]+1; last; }; 
	/-rf3r/ && do { $rf3r=$ARGV[$num+1]+1; last; }; 
	
	/-r3rf/ && do { $r3rf=$ARGV[$num+1]+1; last; };
	/-rr5f/ && do { $rr5f=$ARGV[$num+1]+1; last; };
	
	/-r3rr/ && do { $r3rr=$ARGV[$num+1]+1; last; };
	/-rr5r/ && do { $rr5r=$ARGV[$num+1]+1; last; };
	
#	/--name|-n/ && do { $seqName=$ARGV[$num+1]; last; };
	/--print|-p/ && do { $printParameters=1; last; };
	/--output|-o/ && do { $OutputFileName=$ARGV[$num+1]; 
		open(S,">$OutputFileName") or die "Error : Can't open result file \"$OutputFileName\", $!";
		last; };
	/--help|-h/ && do { exec("pod2text $0\n") ; die };
	}
    }
	if ($printParameters) {
	print "
	       --> f5ff ",$f5ff-1," --> ff3f ",$ff3f-1,"  --> ; 
	       --> f5fr ",$f5fr-1," --> ff3r ",$ff3r-1,"  <-- ; 
	       --> f3rf ",$f3rf-1," <-- fr5f ",$fr5f-1,"  --> ;  
	       --> f3rr ",$f3rr-1," <-- fr5r ",$fr5r-1,"  <-- ; 
	       <-- r5ff ",$r5ff-1," --> rf3f ",$rf3f-1,"  --> ;
	       <-- r5fr ",$r5fr-1," --> rf3r ",$rf3r-1,"  <-- ;
	       <-- r3rf ",$r3rf-1," <-- rr5f ",$rr5f-1,"  --> ; 
	       <-- r3rr ",$r3rr-1," <-- rr5r ",$rr5r-1,"  <-- ;\n";
   }
   ##NC_011753.2	RefSeq	gene	367	834	.	-	.	locus_tag=VS_0001;db_xref=GeneID:7162789
   my $finSeedSens;
   my $finSeedAntisens;
   my $debSeedSens;
   my $debSeedAntisens;
   my $info_gene="";
   my $sensGeneAvant = "+" ; # 1rst seed definition : geneAvant (gene[i-1]) doesn't exist 
   my @chromList;
   while(my $ligne = <F>){
	chomp($ligne);
	my @list = split(/\t/,$ligne);
	if ((scalar(@chromList) == 0) or ($chromList[$#chromList] ne $list[0])){
		push(@chromList, $list[0]);
		my $finSeedSens;
   		my $finSeedAntisens;
   		my $debSeedSens;
   		my $debSeedAntisens;
   		my $info_gene="";
   		my $sensGeneAvant = "+" ; # 1rst seed definition : geneAvant (gene[i-1]) doesn't exist 
	}
	if (($sensGeneAvant eq "+") and ($list[6] eq "+")) { #CTN ie geneavant == f, geneapres == f
		$debSeedSens += $f5ff;
		$finSeedSens = $list[3]- $ff3f;
		$debSeedAntisens += $f3rf;
		$finSeedAntisens = $list[3]- $fr5f;
	} elsif (($sensGeneAvant eq "+") and ($list[6] eq "-")) { #CTN ie geneaavant == f, geneapres == r
		$debSeedSens += $f5fr;
		$finSeedSens = $list[3]- $ff3r;
		$debSeedAntisens += $f3rr;
		$finSeedAntisens = $list[3]- $fr5r;
	} elsif (($sensGeneAvant eq "-") and ($list[6] eq "+")) { #CTN ie geneaavant == r, geneapres == f
		$debSeedSens += $r5ff;
		$finSeedSens = $list[3]- $rf3f;
		$debSeedAntisens += $r3rf;
		$finSeedAntisens = $list[3]- $rr5f;
	} else {                    #CTN ie geneaavant == r, geneapres == r
		$debSeedSens += $r5fr;
		$finSeedSens = $list[3]- $rf3r;
		$debSeedAntisens += $r3rr;
		$finSeedAntisens = $list[3]- $rr5r;
	}
	if ($debSeedSens <= 0) { $debSeedSens=1 ; } # 1srt 
	if ($debSeedAntisens <= 0) { $debSeedAntisens=1 ; }
	if($debSeedSens < $finSeedSens){ # only "real" seed 
			#print "$gene_avant\nNC_011753\tperso\tseed\t$deb_seed\t$fin_seed\t.\t+\t.\tgeneavant=$info_gene;geneapres=$list[@list-1]\n$ligne\n\n";
	   # 
	
		print S "$list[0]\tperso\tseedIR\t$debSeedSens\t$finSeedSens\t.\t+\t.\tgeneavant=$info_gene;geneapres=$list[@list-1]\n";
	}
	if ($debSeedAntisens < $finSeedAntisens){
		print S "$list[0]\tperso\tseedIR\t$debSeedAntisens\t$finSeedAntisens\t.\t-\t.\tgeneavant=$info_gene;geneapres=$list[@list-1]\n";
	}
	$sensGeneAvant = $list[6] ; # GFF : column 6 gives strand
	$debSeedSens = $list[4];
	$debSeedAntisens = $list[4];
	$info_gene = $list[@list-1];
   }
   close F;
   close S;
   exit(0);
}