diff genephys/GenePhys.pl @ 3:8dfa09868059 draft

Uploaded
author mcharles
date Fri, 24 Oct 2014 05:54:20 -0400
parents
children 3d79224aa2dc
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genephys/GenePhys.pl	Fri Oct 24 05:54:20 2014 -0400
@@ -0,0 +1,335 @@
+#!/usr/bin/perl
+#V1.1.0 integrated gene extraction 
+#V1.0.2 integrated segment fasta extraction
+#V1.0.1 added log and option
+#V1.0.0
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $input_blast_files;
+my $input_genes_position_file;
+my $input_assembly_file;
+my $input_markers_position_file;
+my $input_markers_file;
+my $log_file;
+my $output_fasta_file;
+my $output_segment_file;
+my $output_genes_list_file;
+my $EXTRACT_SEQ = "NO";
+my $WINDOW = 200000;
+my $OFFSET = 100000;
+my $MAX_BLAST_LINES = 1;
+
+GetOptions (
+"input_assembly_file=s" => \$input_assembly_file,
+"input_markers_position_file=s" => \$input_markers_position_file,
+"input_markers_file=s" => \$input_markers_file,
+"log_file=s" => \$log_file,
+"output_fasta_file=s" => \$output_fasta_file,
+"output_segment_file=s" => \$output_segment_file,
+"extractseq=s" => \$EXTRACT_SEQ,
+"window=i" => \$WINDOW,
+"offset=i" =>\$OFFSET,
+"input_blast_files=s" => \$input_blast_files,
+"input_genes_position_file=s"=> \$input_genes_position_file,
+"output_genes_list_file=s"=>\$output_genes_list_file,
+"max_blast_lines=i" => \$MAX_BLAST_LINES
+) or die("Error in command line arguments\n");
+
+open(LF, ">$log_file")  or die("Can't open $log_file\n");
+#print LF $EXTRACT_SEQ."\n";
+
+my $current_annotation="";
+my @list_marquer;
+my %chr;
+my %position;
+
+open(MP, $input_markers_position_file)  or die("Can't open $input_markers_position_file\n");
+
+my $compt=0;
+while (my $line=<MP>){
+	$compt++;
+	my @cols = split(/\t/,$line);
+	if ($#cols != 3){
+		print STDERR "Error in marker position file format\n$compt : $line\n";
+		exit(0);
+	}
+	my %current;
+	# Number#Map#Name#Chr#Position#GeneAT#FunctionAT
+	my $Name = $cols[0];
+	my $Locus = $cols[1];
+	my $Chr = $cols[2];
+	my $Position = $cols[3];
+
+
+	$chr{$Name} = $Chr;
+	$position{$Name} = $Position;
+	
+	### Modification 0.9.9
+	if ($Locus ne $Name){ 
+		$chr{$Locus} = $Chr;
+		$position{$Locus} = $Position;	
+	}
+	###
+	
+}
+close (MP);
+
+open(MA, $input_markers_file)  or die("Can't open $input_markers_file\n");
+while (my $line=<MA>){
+	my @cols = split (/\s+/,$line);
+	for (my $i=0;$i<=$#cols;$i++){
+		my $current = $cols[$i];
+		chomp($current);
+		if ($current !~ /^\s+$/){
+			push(@list_marquer,$current);
+		}
+	}
+}
+close (MA);
+
+my %coord_by_chr;
+for (my $i=0;$i<=$#list_marquer;$i++){
+	my $current_name = $list_marquer[$i];
+	my $current_chr = $chr{$current_name};
+	my $current_position = $position{$current_name};
+	
+	if ($current_position =~ /^\d+$/){
+		my @tbl_coord_for_current_chr;
+		if ($coord_by_chr{$current_chr}){
+			@tbl_coord_for_current_chr = @{$coord_by_chr{$current_chr}};
+		}
+		push(@tbl_coord_for_current_chr,$current_position);
+		$coord_by_chr{$current_chr}=\@tbl_coord_for_current_chr;
+	}
+	elsif (($current_position =~/\s*-\s*/)||($current_position =~/none/i)){
+		
+	}
+	else {
+		chomp($current_position);
+		print STDERR "Error Parsing $current_name\tposition not recognized : $current_position \n";
+		print $list_marquer[$i],"\n";
+	}
+}
+
+open(OS, ">$output_segment_file") or die ("Can't open $output_segment_file\n");
+
+my @segment_chr;
+my @segment_start;
+my @segment_end;
+
+foreach my $key (sort keys %coord_by_chr){
+	my @tbl_coord = @{$coord_by_chr{$key}};
+	@tbl_coord = sort { $a <=> $b } @tbl_coord;
+	my $current_start;
+	my $current_stop;
+	my $current_start_with_offset;
+	my $current_stop_with_offset;
+	
+	for (my $i=0;$i<=$#tbl_coord;$i++){
+		if (!$current_start){$current_start=$tbl_coord[$i];$current_stop=$tbl_coord[$i]}
+		
+		# print "$i : $current_start / $current_stop\n";
+		if ($tbl_coord[$i]>$current_stop+$WINDOW){
+			#OFFSET
+			if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
+			$current_stop_with_offset = $current_stop + $OFFSET;
+			#######
+			print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
+			push(@segment_chr,$key);
+			push(@segment_start,$current_start_with_offset);
+			push(@segment_end,$current_stop_with_offset);
+			
+			$current_start = $tbl_coord[$i];
+			$current_stop = $tbl_coord[$i];
+
+			if ($i==$#tbl_coord){				
+				#OFFSET
+				if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
+				$current_stop_with_offset = $current_stop + $OFFSET;
+				#######
+				print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
+				push(@segment_chr,$key);
+				push(@segment_start,$current_start_with_offset);
+				push(@segment_end,$current_stop_with_offset);
+			}
+		}
+		else {
+			$current_stop=$tbl_coord[$i];
+			if ($i==$#tbl_coord){
+				#OFFSET
+				if ($current_start>$OFFSET){$current_start_with_offset=$current_start-$OFFSET;}else{$current_start_with_offset=1;}
+				$current_stop_with_offset = $current_stop + $OFFSET;
+				#######
+				print OS $key,":",$current_start_with_offset,"..",$current_stop_with_offset,"\n";
+				push(@segment_chr,$key);
+				push(@segment_start,$current_start_with_offset);
+				push(@segment_end,$current_stop_with_offset);
+			}
+		}
+	}
+}
+close(OS);
+
+### Sequence extraction
+if ($EXTRACT_SEQ eq "YES"){
+
+	my %genome;
+	my $current_header;
+	my $current_seq="";
+	open(AF, $input_assembly_file) or die ("Can't open $input_assembly_file\n");
+
+	while (my $ligne = <AF>){
+		if ($ligne =~ /^\>(.*?)\s*$/){
+			if ($current_header){
+				$genome{$current_header} = $current_seq;
+			}
+			$current_header=$1;
+			$current_seq = "";
+		}
+		else {
+			if ($ligne=~/^([ATGCNXatgcnx]+)\s*$/){
+				$current_seq .= $1;
+			}
+			else {
+				print STDERR "Erreur Parsing n°1\n$ligne\n";
+			}
+		}
+	}
+
+	#TRAITEMENT DU DERNIER
+	if ($current_header){
+		$genome{$current_header} = $current_seq;
+		undef($current_seq);
+	}
+	close (AF);
+
+	open(OF, ">$output_fasta_file") or die ("Can't open $output_fasta_file\n");
+	for (my $i=0;$i<=$#segment_chr;$i++){
+		my $compt=0;
+		my $current_seq="";
+		print OF ">",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i]."\n";
+		### Modification 0.9.9
+		if ($segment_end[$i]>length($genome{$segment_chr[$i]})){
+			$segment_end[$i] = length($genome{$segment_chr[$i]});
+		}
+		###
+
+		my @SEQ = split(//,$genome{$segment_chr[$i]});
+		for (my $coord = $segment_start[$i]-1; $coord<=$segment_end[$i]-1;$coord++){
+			$compt++;
+			if ($compt > 60 ){
+				$current_seq .= "\n";
+				$compt=1;
+			}
+			$current_seq .= $SEQ[$coord];
+		
+		}
+		print OF "$current_seq\n";
+	}
+	close (OF);
+}
+
+### GENE and BLAST Extraction
+my @blast_by_base;
+my @header;
+
+
+my @blastfiles = split(/\,/,$input_blast_files);
+for (my $i=0;$i<=$#blastfiles;$i++){
+	my $current_blast_file = $blastfiles[$i];
+	my $current_blast_header = "DEFAULT";
+	my %current_blast; 
+	open (B,"$current_blast_file") or die ("Can't open $current_blast_file\n");
+	while (my $line =<B>){
+		if ($line =~ /^\#\#(.*?)$/){
+			$current_blast_header = $1;
+			print LF $current_blast_header."\n";
+		}
+		elsif ($line =~ /^\#/){
+			# blast file column legend
+		}
+		else {
+			my @fields = split(/\s+/,$line);
+			my $gene_id = $fields[0];
+			my @blast_for_this_gene;
+			if ($current_blast{$gene_id}){
+				@blast_for_this_gene = @{$current_blast{$gene_id}};
+			}			
+			
+			if ($#blast_for_this_gene<$MAX_BLAST_LINES-1){
+				push(@blast_for_this_gene,$line);
+				print LF $gene_id,"\n";
+			}
+			$current_blast{$gene_id}=\@blast_for_this_gene;
+		}
+	}
+	close(B);
+	push (@blast_by_base,\%current_blast);
+	push (@header,$current_blast_header);
+}
+
+
+open (OGL,">$output_genes_list_file") or die ("Can't open $output_genes_list_file\n");
+
+for (my $i=0;$i<=$#segment_chr;$i++){
+	my $segment_chr = $segment_chr[$i];
+	my $segment_start = $segment_start[$i];
+	my $segment_end = $segment_end[$i];
+
+	print OGL "#",$segment_chr[$i],":",$segment_start[$i],"..",$segment_end[$i],"\n";
+	
+	open(IG, $input_genes_position_file)  or die("Can't open $input_genes_position_file\n");
+	while (my $gene_desc=<IG>){
+		my @gene_desc = split(/\s+/,$gene_desc);
+		if ($#gene_desc != 4){
+			print STDERR "Error in gene position file format\n$gene_desc\n";
+			exit(0);
+		}
+		my $gene_id = $gene_desc[0];
+		my $cds_id = $gene_desc[1];
+		my $gene_chr = $gene_desc[2];
+		my $gene_start = $gene_desc[3];
+		my $gene_end = $gene_desc[4];
+		if ($segment_chr eq $gene_chr){
+			if ((($gene_start>=$segment_start)&&($gene_start<=$segment_end))||(($gene_end>=$segment_start)&&($gene_end<=$segment_end))){
+				print OGL $gene_id," / ",$cds_id,"\n";
+				
+				for (my $i=0;$i<=$#blast_by_base;$i++){
+					#print LF $header[$i]."\n";
+					my %current_blast = %{$blast_by_base[$i]};
+					if ($current_blast{$cds_id}){
+						my @blast_by_gene = @{$current_blast{$cds_id}};
+						#print LF $#blast_by_gene."\n";
+						for (my $j=0;$j<=$#blast_by_gene;$j++){
+							my @fields = split(/\t/,$blast_by_gene[$j]);
+							print OGL $header[$i],"\t";
+							print OGL $fields[1],"\t";
+							print OGL $fields[3],"\t";
+							print OGL $fields[4],"\t";
+							print OGL $fields[5],"\t";
+							print OGL $fields[10],"\t";
+							print OGL $fields[6],"..",$fields[7],"(",$fields[11],")","\t";
+							print OGL $fields[8],"..",$fields[9],"(",$fields[12],")","\t";
+							print OGL $fields[13];
+						}
+						print OGL "\n";
+					}
+					else {
+						print OGL $header[$i],"\t","No BLAST results\n";
+						print LF $gene_id," / ",$cds_id,"\n";
+					}
+				}
+			}
+		}
+
+	}
+	close(IG);
+}
+
+
+close (OGL);
+
+close (LF);
+