Mercurial > repos > mcharles > genephys
view genephys/GenePhys.pl @ 4:3d79224aa2dc draft
Uploaded
author | mcharles |
---|---|
date | Thu, 30 Oct 2014 06:06:34 -0400 |
parents | 8dfa09868059 |
children |
line wrap: on
line source
#!/usr/bin/perl #V1.1.1 minor bug correction (see in code) #V1.1.0 integrated gene extraction #V1.0.2 integrated segment fasta extraction #V1.0.1 added log and option #V1.0.0 use strict; use warnings; use Getopt::Long; my $input_blast_files; my $input_genes_position_file; my $input_assembly_file; my $input_markers_position_file; my $input_markers_file; my $log_file; my $output_fasta_file; my $output_segment_file; my $output_genes_list_file; my $EXTRACT_SEQ = "NO"; my $WINDOW = 200000; my $OFFSET = 100000; my $MAX_BLAST_LINES = 1; GetOptions ( "input_assembly_file=s" => \$input_assembly_file, "input_markers_position_file=s" => \$input_markers_position_file, "input_markers_file=s" => \$input_markers_file, "log_file=s" => \$log_file, "output_fasta_file=s" => \$output_fasta_file, "output_segment_file=s" => \$output_segment_file, "extractseq=s" => \$EXTRACT_SEQ, "window=i" => \$WINDOW, "offset=i" =>\$OFFSET, "input_blast_files=s" => \$input_blast_files, "input_genes_position_file=s"=> \$input_genes_position_file, "output_genes_list_file=s"=>\$output_genes_list_file, "max_blast_lines=i" => \$MAX_BLAST_LINES ) or die("Error in command line arguments\n"); open(LF, ">$log_file") or die("Can't open $log_file\n"); #print LF $EXTRACT_SEQ."\n"; my $current_annotation=""; my @list_marquer; my %chr; my %position; open(MP, $input_markers_position_file) or die("Can't open $input_markers_position_file\n"); my $compt=0; while (my $line=<MP>){ $compt++; my @cols = split(/\t/,$line); if ($#cols != 3){ print STDERR "Error in marker position file format\n$compt : $line\n"; exit(0); } my %current; # Number#Map#Name#Chr#Position#GeneAT#FunctionAT my $Name = $cols[0]; my $Locus = $cols[1]; my $Chr = $cols[2]; my $Position = $cols[3]; $chr{$Name} = $Chr; $position{$Name} = $Position; ### Modification 0.9.9 if ($Locus ne $Name){ $chr{$Locus} = $Chr; $position{$Locus} = $Position; } ### } close (MP); open(MA, $input_markers_file) or die("Can't open $input_markers_file\n"); while (my $line=<MA>){ my @cols = split (/\s+/,$line); for (my $i=0;$i<=$#cols;$i++){ my $current = $cols[$i]; chomp($current); if ($current !~ /^\s+$/){ push(@list_marquer,$current); } } } close (MA); my %coord_by_chr; for (my $i=0;$i<=$#list_marquer;$i++){ my $current_name = $list_marquer[$i]; my $current_chr = $chr{$current_name}; my $current_position = $position{$current_name}; if ($current_position =~ /^\d+$/){ my @tbl_coord_for_current_chr; if ($coord_by_chr{$current_chr}){ @tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}}; } push(@tbl_coord_for_current_chr,$current_position); $coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr; } elsif (($current_position =~/\s*-\s*/)||($current_position =~/none/i)){ } else { chomp($current_position); print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n"; print $list_marquer[$i],"\n"; } } open(OS, ">$output_segment_file") or die ("Can't open $output_segment_file\n"); my @segment_chr; my @segment_start; my @segment_end; foreach my $key (sort keys %coord_by_chr){ my @tbl_coord = @{$coord_by_chr{$key}}; @tbl_coord = sort { $a <=> $b } @tbl_coord; my $current_start; my $current_stop; my $current_start_with_offset; my $current_stop_with_offset; for (my $i=0;$i<=$#tbl_coord;$i++){ if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]} # print "$i : $current_start / $current_stop\n"; if ($tbl_coord[$i]>$current_stop+$WINDOW){ #OFFSET if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} $current_stop_with_offset = $current_stop + $OFFSET; ####### print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; push(@segment_chr,$key); push(@segment_start,$current_start_with_offset); push(@segment_end,$current_stop_with_offset); $current_start = $tbl_coord[$i]; $current_stop = $tbl_coord[$i]; if ($i==$#tbl_coord){ #OFFSET if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} $current_stop_with_offset = $current_stop + $OFFSET; ####### print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; push(@segment_chr,$key); push(@segment_start,$current_start_with_offset); push(@segment_end,$current_stop_with_offset); } } else { $current_stop=$tbl_coord[$i]; if ($i==$#tbl_coord){ #OFFSET if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;} $current_stop_with_offset = $current_stop + $OFFSET; ####### print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n"; push(@segment_chr,$key); push(@segment_start,$current_start_with_offset); push(@segment_end,$current_stop_with_offset); } } } } close(OS); ### Sequence extraction if ($EXTRACT_SEQ eq "YES"){ my %genome; my $current_header; my $current_seq=""; open(AF, $input_assembly_file) or die ("Can't open $input_assembly_file\n"); while (my $ligne = <AF>){ if ($ligne =~ /^\>(.*?)\s*$/){ if ($current_header){ $genome{$current_header} = $current_seq; } $current_header=$1; $current_seq = ""; } else { if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){ $current_seq .= $1; } else { print STDERR "Erreur Parsing n°1\n$ligne\n"; } } } #TRAITEMENT DU DERNIER if ($current_header){ $genome{$current_header} = $current_seq; undef($current_seq); } close (AF); open(OF, ">$output_fasta_file") or die ("Can't open $output_fasta_file\n"); for (my $i=0;$i<=$#segment_chr;$i++){ my $compt=0; my $current_seq=""; print OF ">",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i]."\n"; ### Modification 0.9.9 if ($segment_end[$i]>length($genome{$segment_chr[$i]})){ $segment_end[$i] = length($genome{$segment_chr[$i]}); } ### my @SEQ = split(//,$genome{$segment_chr[$i]}); for (my $coord = $segment_start[$i]-1; $coord<=$segment_end[$i]-1;$coord++){ $compt++; if ($compt > 60 ){ $current_seq .= "\n"; $compt=1; } $current_seq .= $SEQ[$coord]; } print OF "$current_seq\n"; } close (OF); } ### GENE and BLAST Extraction my @blast_by_base; my @header; my @blastfiles = split(/\,/,$input_blast_files); for (my $i=0;$i<=$#blastfiles;$i++){ my $current_blast_file = $blastfiles[$i]; my $current_blast_header = "DEFAULT"; my %current_blast; open (B,"$current_blast_file") or die ("Can't open $current_blast_file\n"); while (my $line =<B>){ if ($line =~ /^\#\#(.*?)$/){ $current_blast_header = $1; print LF $current_blast_header."\n"; } elsif ($line =~ /^\#/){ # blast file column legend } else { my @fields = split(/\s+/,$line); my $gene_id = $fields[0]; my @blast_for_this_gene; if ($current_blast{$gene_id}){ @blast_for_this_gene = @{$current_blast{$gene_id}}; } if ($#blast_for_this_gene<$MAX_BLAST_LINES-1){ push(@blast_for_this_gene,$line); print LF $gene_id,"\n"; } $current_blast{$gene_id}=\@blast_for_this_gene; } } close(B); push (@blast_by_base,\%current_blast); push (@header,$current_blast_header); } open (OGL,">$output_genes_list_file") or die ("Can't open $output_genes_list_file\n"); for (my $i=0;$i<=$#segment_chr;$i++){ my $segment_chr = $segment_chr[$i]; my $segment_start = $segment_start[$i]; my $segment_end = $segment_end[$i]; print OGL "#",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i],"\n"; open(IG, $input_genes_position_file) or die("Can't open $input_genes_position_file\n"); while (my $gene_desc=<IG>){ my @gene_desc = split(/\t+/,$gene_desc);#Modification in 1.1.1 : \s => \t if ($#gene_desc != 4){ print STDERR "Error in gene position file format\n$gene_desc\n"; exit(0); } my $gene_id = $gene_desc[0]; my $cds_id = $gene_desc[1]; my $gene_chr = $gene_desc[2]; my $gene_start = $gene_desc[3]; my $gene_end = $gene_desc[4]; if ($segment_chr eq $gene_chr){ if ((($gene_start>=$segment_start)&&($gene_start<=$segment_end))||(($gene_end>=$segment_start)&&($gene_end<=$segment_end))){ print OGL $gene_id," / ",$cds_id,"\n"; for (my $i=0;$i<=$#blast_by_base;$i++){ #print LF $header[$i]."\n"; my %current_blast = %{$blast_by_base[$i]}; if ($current_blast{$cds_id}){ my @blast_by_gene = @{$current_blast{$cds_id}}; #print LF $#blast_by_gene."\n"; for (my $j=0;$j<=$#blast_by_gene;$j++){ my @fields = split(/\t/,$blast_by_gene[$j]); print OGL $header[$i],"\t"; print OGL $fields[1],"\t"; print OGL $fields[3],"\t"; print OGL $fields[4],"\t"; print OGL $fields[5],"\t"; print OGL $fields[10],"\t"; print OGL $fields[6],"..",$fields[7],"(",$fields[11],")","\t"; print OGL $fields[8],"..",$fields[9],"(",$fields[12],")","\t"; print OGL $fields[13]; } print OGL "\n"; } else { print OGL $header[$i],"\t","No BLAST results\n"; print LF $gene_id," / ",$cds_id,"\n"; } } } } } close(IG); } close (OGL); close (LF);