view rapsodyn/mpileupfilteronblastxml.pl @ 15:b65abe3ecda1 draft

Uploaded
author mcharles
date Wed, 20 Aug 2014 12:17:41 -0400
parents ad321ff1b67d
children
line wrap: on
line source

#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;

my $input_variant_file;
my $input_blastxml_file;
my $window_length = 50;
my $nb_mismatch_max = 2;

GetOptions (
"input_variant_file=s" => \$input_variant_file,
"input_blastxml_file=s" => \$input_blastxml_file,
"window_length=i" => \$window_length,
"nb_mismatch_max=i" => \$nb_mismatch_max
) or die("Error in command line arguments\n");


open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n");


my $iteration_stop="</Iteration>";
my $hit_stop="</Hit>";
my $hsp_stop="</Hsp>";
my $query_flag_start="<Iteration_query-def>";
my $query_flag_stop="</Iteration_query-def>";
my $subject_flag_start="<Hit_def>";
my $subject_flag_stop="</Hit_def>";
my $query_HSP_flag_from_start="<Hsp_query-from>";
my $query_HSP_flag_from_stop="</Hsp_query-from>";
my $query_HSP_flag_to_start="<Hsp_query-to>";
my $query_HSP_flag_to_stop="</Hsp_query-to>";
my $subject_HSP_flag_from_start="<Hsp_hit-from>";
my $subject_HSP_flag_from_stop="</Hsp_hit-from>";
my $subject_HSP_flag_to_start="<Hsp_hit-to>";
my $subject_HSP_flag_to_stop="</Hsp_hit-to>";
my $query_HSP_flag_seq_start="<Hsp_qseq>";
my $query_HSP_flag_seq_stop="</Hsp_qseq>";
my $subject_HSP_flag_seq_start="<Hsp_hseq>";
my $subject_HSP_flag_seq_stop="</Hsp_hseq>";
my $HSP_midline_flag_start="<Hsp_midline>";
my $HSP_midline_flag_stop="</Hsp_midline>";

my %hash;

my $compt=0;
my $current_query="";
my $current_subject="";
my $current_HSP_query_from;
my $current_HSP_query_to;
my $current_HSP_subject_from;
my $current_HSP_subject_to;
my $current_HSP_midline;
my $current_HSP_qseq;
my $current_HSP_hseq;
my $current_query_short;

while (my $ligne = <INB>) {

	if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){
		$current_subject=$1;
		#print "--",$1,"\n";
	}
	
	if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){
		$current_query=$1;
		$current_query_short=$current_query;
		if ($current_query =~ /^(.*?_\d+)\_/){
			$current_query_short=$1;
		}
		if (!$hash{$current_query_short}){
			$hash{$current_query_short}=0;
		}
			

		#print "--",$1,"\n";
	}
	if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){
		$current_HSP_query_from=$1;
		#print "--",$1,"...";
	}
	if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){
		$current_HSP_query_to=$1;
		#print $1,"\n";
	}
	if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){
		$current_HSP_subject_from=$1;
		#print "--",$1,"...";
	}
	if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){
		$current_HSP_subject_to=$1;
		#print $1,"\n";
	}
	if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){
		$current_HSP_qseq=$1;
		#print "--",$1,"\n";
	}
	if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){
		$current_HSP_hseq=$1;
		#print "--",$1,"\n";
	}
	if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){
		$current_HSP_midline=$1;
		#print "--",$1,"\n";
	}
	
	if ($ligne=~/$hsp_stop/){
		if ($current_HSP_query_from){
			#print "\ntest1\n";
			#print "Query : $current_query\n";
			#print "Subject : $current_subject\n";
			#print "$current_HSP_query_from ... $current_HSP_query_to\n";
			#print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
			for (my $i=1;$i<$current_HSP_query_from;$i++){
				$current_HSP_qseq = "N".$current_HSP_qseq;
				$current_HSP_midline = " ".$current_HSP_midline;
				$current_HSP_hseq = "N".$current_HSP_hseq;
			}
			for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
				$current_HSP_qseq .= "N";
				$current_HSP_midline .= " ";
				$current_HSP_hseq .= "N";
			}
			
			my @qseq = split(//,$current_HSP_qseq);
			my @midline = split(//,$current_HSP_midline);
			my @hseq = split(//,$current_HSP_hseq);
			
			my $comptbase=0;
			my $compt5p=0;
			my $compt3p=0;
			for (my $i=0;$i<=$#qseq;$i++){
				if ($qseq[$i] ne "-"){
					$comptbase++; # Va de 1 -> $window_length *2 +1
				}
				if ($midline[$i] eq " "){
					if ($comptbase<=$window_length){ #1 -> $window_length
						$compt5p++;
					}
					elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
						$compt3p++;
					}
					else { #+1-$window_length*2+1
						
					}
				}
			}
			if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
				$hash{$current_query_short}++;
			}
			
			#print "$current_HSP_qseq\n";
			#print "$current_HSP_midline\n";
			#print "$current_HSP_hseq\n";
			#print "$compt5p // $compt3p\n";
			#print $hash{$current_query_short},"\n";

		}
		
		undef $current_HSP_query_from;
		undef $current_HSP_query_to;
		undef $current_HSP_subject_from;
		undef $current_HSP_subject_to;
		$current_HSP_midline="";
		$current_HSP_qseq="";
		$current_HSP_hseq="";
	}
	
	if ($ligne=~/$iteration_stop/){
		if ($current_HSP_query_from){
			#print "\ntest2\n";
			#print "Query : $current_query\n";
			#print "Subject : $current_subject\n";
			#print "$current_HSP_query_from ... $current_HSP_query_to\n";
			#print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
			for (my $i=1;$i<$current_HSP_query_from;$i++){
				$current_HSP_qseq = "N".$current_HSP_qseq;
				$current_HSP_midline = " ".$current_HSP_midline;
				$current_HSP_hseq = "N".$current_HSP_hseq;
			}
			for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
				$current_HSP_qseq .= "N";
				$current_HSP_midline .= " ";
				$current_HSP_hseq .= "N";
			}
			
			my @qseq = split(//,$current_HSP_qseq);
			my @midline = split(//,$current_HSP_midline);
			my @hseq = split(//,$current_HSP_hseq);
			
			my $comptbase=0;
			my $compt5p=0;
			my $compt3p=0;
			for (my $i=0;$i<=$#qseq;$i++){
				if ($qseq[$i] ne "-"){
					$comptbase++; # Va de 1 -> $window_length *2 +1
				}
				if ($midline[$i] eq " "){
					if ($comptbase<=$window_length){ #1 -> $window_length
						$compt5p++;
					}
					elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
						$compt3p++;
					}
					else { #+1-$window_length*2+1
						
					}
				}
			}
			if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
				$hash{$current_query_short}++;
			}
			
			#print "$current_HSP_qseq\n";
			#print "$current_HSP_midline\n";
			#print "$current_HSP_hseq\n";
			#print "$compt5p // $compt3p\n";
			#print $hash{$current_query_short},"\n";
		}
		$current_query="";
		$current_query_short="";
		$current_subject="";
		undef $current_HSP_query_from;
		undef $current_HSP_query_to;
		undef $current_HSP_subject_from;
		undef $current_HSP_subject_to;
		$current_HSP_midline="";
		$current_HSP_qseq="";
		$current_HSP_hseq="";
	}
	
}

close (INB);

# foreach my $key (sort trinombre keys %hash){
	# print $key," ",$hash{$key},"\n";
	
# }
# exit(0);

open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");

while (my $ligne = <INV>) {
	my @champs = split (/\s+/,$ligne);
	my $header = $champs[0]."_".$champs[1];

	if ($hash{$header}){
		if ($hash{$header}==1){
			print "$ligne";
		}
		else {
			#print $hash{$header}," $ligne";
		}
	}
	else {
		print STDERR "No blast result for ",$header,"\n";
	}

	
}

close(INV);



sub trinombre {
	my $chra=$a;
	my $posa=0;
	my $chrb=$b;
	my $posb=0;
	
	if ($a =~/(.*?)\_(\d+)/){
		$chra=$1;
		$posa=$2;
	}
	if ($b =~/(.*?)\_(\d+)/){
		$chrb=$1;
		$posb=$2;
	}
	
	
	$chra cmp $chrb
	||
	$posa <=> $posb;
}