diff rapsodyn/ParseBlastForUniqueMatch.pl @ 0:442a7c88b886 draft

Uploaded
author mcharles
date Wed, 10 Sep 2014 09:18:15 -0400
parents
children 3f7b0788a1c4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rapsodyn/ParseBlastForUniqueMatch.pl	Wed Sep 10 09:18:15 2014 -0400
@@ -0,0 +1,176 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+
+my $input_variant_file = $ARGV[0];
+my $input_blast_file = $ARGV[1];
+my $window_length = $ARGV[2];
+my $nb_mismatch_max = $ARGV[3];
+
+my %hash_name;
+
+open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
+while (my $line =<INB>){
+	my @fields = split (/\s+/,$line);
+	# print $#fields,"\n";
+	# print $line;
+	# exit(0);
+	my $query_name;
+	my $query_start;
+	my $query_stop;
+	my $query_aln;
+	my $subject_aln;
+	my $compt_mismatch_5p=0;
+	my $compt_mismatch_3p=0;
+	
+	if ($#fields == 24){
+		$query_name = $fields[0];
+		$query_start = $fields[6];
+		$query_stop = $fields[7];
+		$query_aln = $fields[20];
+		$subject_aln = $fields[21];
+	}
+	elsif ($#fields == 4){
+		$query_name = $fields[0];
+		$query_start = $fields[1];
+		$query_stop = $fields[2];
+		$query_aln = $fields[3];
+		$subject_aln = $fields[4];
+	}
+	else {
+		print STDERR "Unrecongnized tabular format for blast result\nScript works with 25 column or 5 custom column(qseqid,qstart,qend,ssseq,sseq)\n";
+		exit(0);
+	}
+	
+	
+	my @field_name = split (/\_/,$query_name);
+	my $name = $field_name[0]."_".$field_name[1];
+	if ($query_name =~ /random/){
+		$name .= "_".$field_name[2];
+		# print $name."\n";
+	
+	}
+	if (!$hash_name{$name}){
+		$hash_name{$name}=0;
+	}
+	elsif ($hash_name{$name}>1){
+		next;
+	}
+	# if ($query_name eq "chrUnn_random_8279117_1_M1_a"){
+		# print "READ : $query_name\n$name\n";
+		# print "HASH : ",$hash_name{$name},"\n";
+		# # exit(0);
+	# }
+	
+	
+	
+	chomp($query_aln);
+	chomp($subject_aln);
+	
+	my $nb_gap_query=0;
+	
+	if (length($query_aln) == length($subject_aln)){
+		if (length($query_aln)<$window_length-$nb_mismatch_max){
+		}
+		else {
+			my @q = split(//,$query_aln);
+			my @s = split(//,$subject_aln);
+			for (my $i=0;$i<=$#q;$i++){
+				my $global_idx = $query_start-1+$i-$nb_gap_query;
+				if ($q[$i] eq "-"){
+					if ($global_idx < $window_length){
+						$compt_mismatch_5p++;
+					}
+					elsif ($global_idx > $window_length){
+						$compt_mismatch_3p++;
+					}
+					$nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
+				}
+				else {
+					if ($q[$i] ne $s[$i]){
+						if ($global_idx < $window_length){
+							$compt_mismatch_5p++;
+						}
+						elsif ($global_idx > $window_length){
+							$compt_mismatch_3p++;
+						}
+					}
+				}
+			}
+			$compt_mismatch_5p += $query_start-1;
+			$compt_mismatch_3p += $window_length *2 + 1 - $query_stop;
+			
+			# for (my $i=0;$i<$window_length;$i++){
+				# if ($tbl_q_aln[$i] eq "#"){
+					# $compt_mismatch_5p++;
+				# }
+				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
+					# $compt_mismatch_5p++;
+				# }
+				# else {
+				# }
+
+			# }
+			# for (my $i=$window_length+1;$i<=$window_length*2;$i++){
+				# if ($tbl_q_aln[$i] eq "#"){
+					# $compt_mismatch_3p++;
+				# }
+				# elsif ($tbl_q_aln[$i] ne $tbl_s_aln[$i]){
+					# $compt_mismatch_3p++;
+				# }
+				# else {
+				# }
+			# }
+			if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){
+				$hash_name{$name}++;
+			}
+			
+		}
+	}
+	else {
+		print STDERR "incompatible subject and query alignement length\n $query_aln\n$subject_aln\n";
+	}
+	
+	# if ($line=~/chrCnn_random_49828229/){
+		# print $line;
+		# print $query_aln,"\n";
+		# print $subject_aln,"\n";
+		# print $compt_mismatch_5p,"\t",$compt_mismatch_3p,"\n";
+		# print $hash_name{"chrCnn_random_49828229"},"\n";
+		# print "\n";
+	# }
+	
+	
+	
+}
+# exit(0);
+	
+close (INB);
+
+open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
+
+while (my $ligne = <INV>) {
+	
+	my @champs = split (/\s+/,$ligne);
+	my $header = $champs[0]."_".$champs[1];
+
+	if ($hash_name{$header}){
+		if ($hash_name{$header}==1){
+			print $ligne;
+		}
+	}
+	else {
+		#print STDERR "No blast result for ",$header,"\n";
+	}
+
+	
+}
+
+close(INV);
+
+
+# foreach my $key (sort keys %hash_name){
+	# print $key,"\t",$hash_name{$key},"\n";
+	
+# }