view rapsodyn/ParseBlastForUniqueMatch.pl @ 10:0a6c1cfe4dc8 draft

Uploaded
author mcharles
date Mon, 19 Jan 2015 04:33:21 -0500
parents 3f7b0788a1c4
children
line wrap: on
line source

#!/usr/bin/perl
#V1.1.0 manage empty files
#V1.0.1 added log, option parameters
use strict;
use warnings;
use Getopt::Long;


my $input_variant_file;
my $input_blast_file;
my $log_file;
my $WINDOW_LENGTH= 50;
my $NB_MISMATCH_MAX = 3;

GetOptions (
"input_variant_file=s" => \$input_variant_file,
"input_blast_file=s" => \$input_blast_file,
"window_length=s" => \$WINDOW_LENGTH,
"log_file=s" => \$log_file,
"nb_mismatch_max=s" => \$NB_MISMATCH_MAX
) or die("Error in command line arguments\n");

my $nb_variant_checked=0;
my $nb_variant_selected=0;
my %hash_name;

open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
if ( -z INB){
	exit(0);
}

while (my $line =<INB>){
	my @fields = split (/\s+/,$line);
	# print $#fields,"\n";
	# print $line;
	# exit(0);
	my $query_name;
	my $query_start;
	my $query_stop;
	my $query_aln;
	my $subject_aln;
	my $compt_mismatch_5p=0;
	my $compt_mismatch_3p=0;
	
	if ($#fields == 24){
		$query_name = $fields[0];
		$query_start = $fields[6];
		$query_stop = $fields[7];
		$query_aln = $fields[20];
		$subject_aln = $fields[21];
	}
	elsif ($#fields == 4){
		$query_name = $fields[0];
		$query_start = $fields[1];
		$query_stop = $fields[2];
		$query_aln = $fields[3];
		$subject_aln = $fields[4];
	}
	else {
		print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
		exit(0);
	}
	
	
	my @field_name = split (/\_/,$query_name);
	my $name = $field_name[0]."_".$field_name[1];
	if ($query_name =~ /random/){
		$name .= "_".$field_name[2];
		# print $name."\n";
	
	}
	if (!$hash_name{$name}){
		$hash_name{$name}=0;
	}
	elsif ($hash_name{$name}>1){
		next;
	}
	# if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
		# print "READ : $query_name\n$name\n";
		# print "HASH : ",$hash_name{$name},"\n";
		# # exit(0);
	# }
	
	
	
	chomp($query_aln);
	chomp($subject_aln);
	
	my $nb_gap_query=0;
	
	if (length($query_aln) == length($subject_aln)){
		if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){
		}
		else {
			my @q = split(//,$query_aln);
			my @s = split(//,$subject_aln);
			for (my $i=0;$i<=$#q;$i++){
				my $global_idx = $query_start-1+$i-$nb_gap_query;
				if ($q[$i] eq "-"){
					if ($global_idx < $WINDOW_LENGTH){
						$compt_mismatch_5p++;
					}
					elsif ($global_idx > $WINDOW_LENGTH){
						$compt_mismatch_3p++;
					}
					$nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
				}
				else {
					if ($q[$i] ne $s[$i]){
						if ($global_idx < $WINDOW_LENGTH){
							$compt_mismatch_5p++;
						}
						elsif ($global_idx > $WINDOW_LENGTH){
							$compt_mismatch_3p++;
						}
					}
				}
			}
			$compt_mismatch_5p += $query_start-1;
			$compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop;
			
			# for (my $i=0;$i<$window_length;$i++){
				# if ($tbl_q_aln[$i] eq "#"){
					# $compt_mismatch_5p++;
				# }
				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
					# $compt_mismatch_5p++;
				# }
				# else {
				# }

			# }
			# for (my $i=$window_length+1;$i<=$window_length*2;$i++){
				# if ($tbl_q_aln[$i] eq "#"){
					# $compt_mismatch_3p++;
				# }
				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
					# $compt_mismatch_3p++;
				# }
				# else {
				# }
			# }
			if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){
				$hash_name{$name}++;
			}
			
		}
	}
	else {
		print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
	}
	
	# if ($line=~/chrCnn_random_49828229/){
		# print $line;
		# print $query_aln,"\n";
		# print $subject_aln,"\n";
		# print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
		# print $hash_name{"chrCnn_random_49828229"},"\n";
		# print "\n";
	# }
	
	
	
}
# exit(0);
	
close (INB);

open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");

while (my $ligne = <INV>) {
	$nb_variant_checked++;
	
	my @champs = split (/\s+/,$ligne);
	my $header = $champs[0]."_".$champs[1];

	if ($hash_name{$header}){
		if ($hash_name{$header}==1){
			print $ligne;
			$nb_variant_selected++;
		}
	}
	else {
		#print STDERR "No blast result for ",$header,"\n";
	}

	
}

close(INV);

open (LF,">$log_file") or die("Can't open $log_file\n");
print LF "\n####\t Blast filtering \n";
print LF "Variant checked  :\t$nb_variant_checked\n";
print LF "Variant selected :\t$nb_variant_selected\n";
close (LF);


# foreach my $key (sort keys %hash_name){
	# print $key,"\t",$hash_name{$key},"\n";
	
# }