Mercurial > repos > mcharles > rapsodyn
view rapsodyn/mpileupfilteronblastxml.pl @ 17:9435ccd4e0c4 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 20 Aug 2014 12:23:06 -0400 |
parents | ad321ff1b67d |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use warnings; use Getopt::Long; my $input_variant_file; my $input_blastxml_file; my $window_length = 50; my $nb_mismatch_max = 2; GetOptions ( "input_variant_file=s" => \$input_variant_file, "input_blastxml_file=s" => \$input_blastxml_file, "window_length=i" => \$window_length, "nb_mismatch_max=i" => \$nb_mismatch_max ) or die("Error in command line arguments\n"); open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n"); my $iteration_stop="</Iteration>"; my $hit_stop="</Hit>"; my $hsp_stop="</Hsp>"; my $query_flag_start="<Iteration_query-def>"; my $query_flag_stop="</Iteration_query-def>"; my $subject_flag_start="<Hit_def>"; my $subject_flag_stop="</Hit_def>"; my $query_HSP_flag_from_start="<Hsp_query-from>"; my $query_HSP_flag_from_stop="</Hsp_query-from>"; my $query_HSP_flag_to_start="<Hsp_query-to>"; my $query_HSP_flag_to_stop="</Hsp_query-to>"; my $subject_HSP_flag_from_start="<Hsp_hit-from>"; my $subject_HSP_flag_from_stop="</Hsp_hit-from>"; my $subject_HSP_flag_to_start="<Hsp_hit-to>"; my $subject_HSP_flag_to_stop="</Hsp_hit-to>"; my $query_HSP_flag_seq_start="<Hsp_qseq>"; my $query_HSP_flag_seq_stop="</Hsp_qseq>"; my $subject_HSP_flag_seq_start="<Hsp_hseq>"; my $subject_HSP_flag_seq_stop="</Hsp_hseq>"; my $HSP_midline_flag_start="<Hsp_midline>"; my $HSP_midline_flag_stop="</Hsp_midline>"; my %hash; my $compt=0; my $current_query=""; my $current_subject=""; my $current_HSP_query_from; my $current_HSP_query_to; my $current_HSP_subject_from; my $current_HSP_subject_to; my $current_HSP_midline; my $current_HSP_qseq; my $current_HSP_hseq; my $current_query_short; while (my $ligne = <INB>) { if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){ $current_subject=$1; #print "--",$1,"\n"; } if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){ $current_query=$1; $current_query_short=$current_query; if ($current_query =~ /^(.*?_\d+)\_/){ $current_query_short=$1; } if (!$hash{$current_query_short}){ $hash{$current_query_short}=0; } #print "--",$1,"\n"; } if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){ $current_HSP_query_from=$1; #print "--",$1,"..."; } if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){ $current_HSP_query_to=$1; #print $1,"\n"; } if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){ $current_HSP_subject_from=$1; #print "--",$1,"..."; } if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){ $current_HSP_subject_to=$1; #print $1,"\n"; } if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){ $current_HSP_qseq=$1; #print "--",$1,"\n"; } if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){ $current_HSP_hseq=$1; #print "--",$1,"\n"; } if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){ $current_HSP_midline=$1; #print "--",$1,"\n"; } if ($ligne=~/$hsp_stop/){ if ($current_HSP_query_from){ #print "\ntest1\n"; #print "Query : $current_query\n"; #print "Subject : $current_subject\n"; #print "$current_HSP_query_from ... $current_HSP_query_to\n"; #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; for (my $i=1;$i<$current_HSP_query_from;$i++){ $current_HSP_qseq = "N".$current_HSP_qseq; $current_HSP_midline = " ".$current_HSP_midline; $current_HSP_hseq = "N".$current_HSP_hseq; } for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ $current_HSP_qseq .= "N"; $current_HSP_midline .= " "; $current_HSP_hseq .= "N"; } my @qseq = split(//,$current_HSP_qseq); my @midline = split(//,$current_HSP_midline); my @hseq = split(//,$current_HSP_hseq); my $comptbase=0; my $compt5p=0; my $compt3p=0; for (my $i=0;$i<=$#qseq;$i++){ if ($qseq[$i] ne "-"){ $comptbase++; # Va de 1 -> $window_length *2 +1 } if ($midline[$i] eq " "){ if ($comptbase<=$window_length){ #1 -> $window_length $compt5p++; } elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; $compt3p++; } else { #+1-$window_length*2+1 } } } if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ $hash{$current_query_short}++; } #print "$current_HSP_qseq\n"; #print "$current_HSP_midline\n"; #print "$current_HSP_hseq\n"; #print "$compt5p // $compt3p\n"; #print $hash{$current_query_short},"\n"; } undef $current_HSP_query_from; undef $current_HSP_query_to; undef $current_HSP_subject_from; undef $current_HSP_subject_to; $current_HSP_midline=""; $current_HSP_qseq=""; $current_HSP_hseq=""; } if ($ligne=~/$iteration_stop/){ if ($current_HSP_query_from){ #print "\ntest2\n"; #print "Query : $current_query\n"; #print "Subject : $current_subject\n"; #print "$current_HSP_query_from ... $current_HSP_query_to\n"; #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; for (my $i=1;$i<$current_HSP_query_from;$i++){ $current_HSP_qseq = "N".$current_HSP_qseq; $current_HSP_midline = " ".$current_HSP_midline; $current_HSP_hseq = "N".$current_HSP_hseq; } for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ $current_HSP_qseq .= "N"; $current_HSP_midline .= " "; $current_HSP_hseq .= "N"; } my @qseq = split(//,$current_HSP_qseq); my @midline = split(//,$current_HSP_midline); my @hseq = split(//,$current_HSP_hseq); my $comptbase=0; my $compt5p=0; my $compt3p=0; for (my $i=0;$i<=$#qseq;$i++){ if ($qseq[$i] ne "-"){ $comptbase++; # Va de 1 -> $window_length *2 +1 } if ($midline[$i] eq " "){ if ($comptbase<=$window_length){ #1 -> $window_length $compt5p++; } elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; $compt3p++; } else { #+1-$window_length*2+1 } } } if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ $hash{$current_query_short}++; } #print "$current_HSP_qseq\n"; #print "$current_HSP_midline\n"; #print "$current_HSP_hseq\n"; #print "$compt5p // $compt3p\n"; #print $hash{$current_query_short},"\n"; } $current_query=""; $current_query_short=""; $current_subject=""; undef $current_HSP_query_from; undef $current_HSP_query_to; undef $current_HSP_subject_from; undef $current_HSP_subject_to; $current_HSP_midline=""; $current_HSP_qseq=""; $current_HSP_hseq=""; } } close (INB); # foreach my $key (sort trinombre keys %hash){ # print $key," ",$hash{$key},"\n"; # } # exit(0); open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); while (my $ligne = <INV>) { my @champs = split (/\s+/,$ligne); my $header = $champs[0]."_".$champs[1]; if ($hash{$header}){ if ($hash{$header}==1){ print "$ligne"; } else { #print $hash{$header}," $ligne"; } } else { print STDERR "No blast result for ",$header,"\n"; } } close(INV); sub trinombre { my $chra=$a; my $posa=0; my $chrb=$b; my $posb=0; if ($a =~/(.*?)\_(\d+)/){ $chra=$1; $posa=$2; } if ($b =~/(.*?)\_(\d+)/){ $chrb=$1; $posb=$2; } $chra cmp $chrb || $posa <=> $posb; }