Mercurial > repos > mcharles > genephys
diff genephys/GenePhys.pl @ 3:8dfa09868059 draft
Uploaded
author | mcharles |
---|---|
date | Fri, 24 Oct 2014 05:54:20 -0400 |
parents | |
children | 3d79224aa2dc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genephys/GenePhys.pl Fri Oct 24 05:54:20 2014 -0400 @@ -0,0 +1,335 @@ +#!/usr/bin/perl +#V1.1.0 integrated gene extraction +#V1.0.2 integrated segment fasta extraction +#V1.0.1 added log and option +#V1.0.0 +use strict; +use warnings; +use Getopt::Long; + +my $input_blast_files; +my $input_genes_position_file; +my $input_assembly_file; +my $input_markers_position_file; +my $input_markers_file; +my $log_file; +my $output_fasta_file; +my $output_segment_file; +my $output_genes_list_file; +my $EXTRACT_SEQ = "NO"; +my $WINDOW = 200000; +my $OFFSET = 100000; +my $MAX_BLAST_LINES = 1; + +GetOptions ( +"input_assembly_file=s" => \$input_assembly_file, +"input_markers_position_file=s" => \$input_markers_position_file, +"input_markers_file=s" => \$input_markers_file, +"log_file=s" => \$log_file, +"output_fasta_file=s" => \$output_fasta_file, +"output_segment_file=s" => \$output_segment_file, +"extractseq=s" => \$EXTRACT_SEQ, +"window=i" => \$WINDOW, +"offset=i" =>\$OFFSET, +"input_blast_files=s" => \$input_blast_files, +"input_genes_position_file=s"=> \$input_genes_position_file, +"output_genes_list_file=s"=>\$output_genes_list_file, +"max_blast_lines=i" => \$MAX_BLAST_LINES +) or die("Error in command line arguments\n"); + +open(LF, ">$log_file") or die("Can't open $log_file\n"); +#print LF $EXTRACT_SEQ."\n"; + +my $current_annotation=""; +my @list_marquer; +my %chr; +my %position; + +open(MP, $input_markers_position_file) or die("Can't open $input_markers_position_file\n"); + +my $compt=0; +while (my $line=<MP>){ + $compt++; + my @cols = split(/\t/,$line); + if ($#cols != 3){ + print STDERR "Error in marker position file format\n$compt : $line\n"; + exit(0); + } + my %current; + # Number#Map#Name#Chr#Position#GeneAT#FunctionAT + my $Name = $cols[0]; + my $Locus = $cols[1]; + my $Chr = $cols[2]; + my $Position = $cols[3]; + + + $chr{$Name} = $Chr; + $position{$Name} = $Position; + + ### Modification 0.9.9 + if ($Locus ne $Name){ + $chr{$Locus} = $Chr; + $position{$Locus} = $Position; + } + ### + +} +close (MP); + +open(MA, $input_markers_file) or die("Can't open $input_markers_file\n"); +while (my $line=<MA>){ + my @cols = split (/\s+/,$line); + for (my $i=0;$i<=$#cols;$i++){ + my $current = $cols[$i]; + chomp($current); + if ($current !~ /^\s+$/){ + push(@list_marquer,$current); + } + } +} +close (MA); + +my %coord_by_chr; +for (my $i=0;$i<=$#list_marquer;$i++){ + my $current_name = $list_marquer[$i]; + my $current_chr = $chr{$current_name}; + my $current_position = $position{$current_name}; + + if ($current_position =~ /^\d+$/){ + my @tbl_coord_for_current_chr; + if ($coord_by_chr{$current_chr}){ + @tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}}; + } + push(@tbl_coord_for_current_chr,$current_position); + $coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr; + } + elsif (($current_position =~/\s*-\s*/)||($current_position =~/none/i)){ + + } + else { + chomp($current_position); + print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n"; + print $list_marquer[$i],"\n"; + } +} + +open(OS, ">$output_segment_file") or die ("Can't open $output_segment_file\n"); + +my @segment_chr; +my @segment_start; +my @segment_end; + +foreach my $key (sort keys %coord_by_chr){ + my @tbl_coord = @{$coord_by_chr{$key}}; + @tbl_coord = sort { $a <=> $b } @tbl_coord; + my $current_start; + my $current_stop; + my $current_start_with_offset; + my $current_stop_with_offset; + + for (my $i=0;$i<=$#tbl_coord;$i++){ + if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]} + + # print "$i : $current_start / $current_stop\n"; + if ($tbl_coord[$i]>$current_stop+$WINDOW){ + #OFFSET + if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} + $current_stop_with_offset = $current_stop + $OFFSET; + ####### + print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; + push(@segment_chr,$key); + push(@segment_start,$current_start_with_offset); + push(@segment_end,$current_stop_with_offset); + + $current_start = $tbl_coord[$i]; + $current_stop = $tbl_coord[$i]; + + if ($i==$#tbl_coord){ + #OFFSET + if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} + $current_stop_with_offset = $current_stop + $OFFSET; + ####### + print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; + push(@segment_chr,$key); + push(@segment_start,$current_start_with_offset); + push(@segment_end,$current_stop_with_offset); + } + } + else { + $current_stop=$tbl_coord[$i]; + if ($i==$#tbl_coord){ + #OFFSET + if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} + $current_stop_with_offset = $current_stop + $OFFSET; + ####### + print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; + push(@segment_chr,$key); + push(@segment_start,$current_start_with_offset); + push(@segment_end,$current_stop_with_offset); + } + } + } +} +close(OS); + +### Sequence extraction +if ($EXTRACT_SEQ eq "YES"){ + + my %genome; + my $current_header; + my $current_seq=""; + open(AF, $input_assembly_file) or die ("Can't open $input_assembly_file\n"); + + while (my $ligne = <AF>){ + if ($ligne =~ /^\>(.*?)\s*$/){ + if ($current_header){ + $genome{$current_header} = $current_seq; + } + $current_header=$1; + $current_seq = ""; + } + else { + if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ + $current_seq .= $1; + } + else { + print STDERR "Erreur Parsing n°1\n$ligne\n"; + } + } + } + + #TRAITEMENT DU DERNIER + if ($current_header){ + $genome{$current_header} = $current_seq; + undef($current_seq); + } + close (AF); + + open(OF, ">$output_fasta_file") or die ("Can't open $output_fasta_file\n"); + for (my $i=0;$i<=$#segment_chr;$i++){ + my $compt=0; + my $current_seq=""; + print OF ">",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i]."\n"; + ### Modification 0.9.9 + if ($segment_end[$i]>length($genome{$segment_chr[$i]})){ + $segment_end[$i] = length($genome{$segment_chr[$i]}); + } + ### + + my @SEQ = split(//,$genome{$segment_chr[$i]}); + for (my $coord = $segment_start[$i]-1; $coord<=$segment_end[$i]-1;$coord++){ + $compt++; + if ($compt > 60 ){ + $current_seq .= "\n"; + $compt=1; + } + $current_seq .= $SEQ[$coord]; + + } + print OF "$current_seq\n"; + } + close (OF); +} + +### GENE and BLAST Extraction +my @blast_by_base; +my @header; + + +my @blastfiles = split(/\,/,$input_blast_files); +for (my $i=0;$i<=$#blastfiles;$i++){ + my $current_blast_file = $blastfiles[$i]; + my $current_blast_header = "DEFAULT"; + my %current_blast; + open (B,"$current_blast_file") or die ("Can't open $current_blast_file\n"); + while (my $line =<B>){ + if ($line =~ /^\#\#(.*?)$/){ + $current_blast_header = $1; + print LF $current_blast_header."\n"; + } + elsif ($line =~ /^\#/){ + # blast file column legend + } + else { + my @fields = split(/\s+/,$line); + my $gene_id = $fields[0]; + my @blast_for_this_gene; + if ($current_blast{$gene_id}){ + @blast_for_this_gene = @{$current_blast{$gene_id}}; + } + + if ($#blast_for_this_gene<$MAX_BLAST_LINES-1){ + push(@blast_for_this_gene,$line); + print LF $gene_id,"\n"; + } + $current_blast{$gene_id}=\@blast_for_this_gene; + } + } + close(B); + push (@blast_by_base,\%current_blast); + push (@header,$current_blast_header); +} + + +open (OGL,">$output_genes_list_file") or die ("Can't open $output_genes_list_file\n"); + +for (my $i=0;$i<=$#segment_chr;$i++){ + my $segment_chr = $segment_chr[$i]; + my $segment_start = $segment_start[$i]; + my $segment_end = $segment_end[$i]; + + print OGL "#",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i],"\n"; + + open(IG, $input_genes_position_file) or die("Can't open $input_genes_position_file\n"); + while (my $gene_desc=<IG>){ + my @gene_desc = split(/\s+/,$gene_desc); + if ($#gene_desc != 4){ + print STDERR "Error in gene position file format\n$gene_desc\n"; + exit(0); + } + my $gene_id = $gene_desc[0]; + my $cds_id = $gene_desc[1]; + my $gene_chr = $gene_desc[2]; + my $gene_start = $gene_desc[3]; + my $gene_end = $gene_desc[4]; + if ($segment_chr eq $gene_chr){ + if ((($gene_start>=$segment_start)&&($gene_start<=$segment_end))||(($gene_end>=$segment_start)&&($gene_end<=$segment_end))){ + print OGL $gene_id," / ",$cds_id,"\n"; + + for (my $i=0;$i<=$#blast_by_base;$i++){ + #print LF $header[$i]."\n"; + my %current_blast = %{$blast_by_base[$i]}; + if ($current_blast{$cds_id}){ + my @blast_by_gene = @{$current_blast{$cds_id}}; + #print LF $#blast_by_gene."\n"; + for (my $j=0;$j<=$#blast_by_gene;$j++){ + my @fields = split(/\t/,$blast_by_gene[$j]); + print OGL $header[$i],"\t"; + print OGL $fields[1],"\t"; + print OGL $fields[3],"\t"; + print OGL $fields[4],"\t"; + print OGL $fields[5],"\t"; + print OGL $fields[10],"\t"; + print OGL $fields[6],"..",$fields[7],"(",$fields[11],")","\t"; + print OGL $fields[8],"..",$fields[9],"(",$fields[12],")","\t"; + print OGL $fields[13]; + } + print OGL "\n"; + } + else { + print OGL $header[$i],"\t","No BLAST results\n"; + print LF $gene_id," / ",$cds_id,"\n"; + } + } + } + } + + } + close(IG); +} + + +close (OGL); + +close (LF); +