comparison rapsodyn/ParseBlastForUniqueMatch.pl @ 7:3f7b0788a1c4 draft

Uploaded
author mcharles
date Tue, 07 Oct 2014 10:34:34 -0400
parents 442a7c88b886
children 0a6c1cfe4dc8
comparison
equal deleted inserted replaced
6:1776b8ddd87e 7:3f7b0788a1c4
1 #!/usr/bin/perl 1 #!/usr/bin/perl
2 #V1.0.1 added log, option parameters
2 use strict; 3 use strict;
3 use warnings; 4 use warnings;
5 use Getopt::Long;
4 6
5 7
6 my $input_variant_file = $ARGV[0]; 8 my $input_variant_file;
7 my $input_blast_file = $ARGV[1]; 9 my $input_blast_file;
8 my $window_length = $ARGV[2]; 10 my $log_file;
9 my $nb_mismatch_max = $ARGV[3]; 11 my $WINDOW_LENGTH= 50;
12 my $NB_MISMATCH_MAX = 3;
10 13
14 GetOptions (
15 "input_variant_file=s" => \$input_variant_file,
16 "input_blast_file=s" => \$input_blast_file,
17 "window_length=s" => \$WINDOW_LENGTH,
18 "log_file=s" => \$log_file,
19 "nb_mismatch_max=s" => \$NB_MISMATCH_MAX
20 ) or die("Error in command line arguments\n");
21
22 my $nb_variant_checked=0;
23 my $nb_variant_selected=0;
11 my %hash_name; 24 my %hash_name;
12 25
13 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n"); 26 open(INB, $input_blast_file) or die ("Can't open $input_blast_file\n");
14 while (my $line =<INB>){ 27 while (my $line =<INB>){
15 my @fields = split (/\s+/,$line); 28 my @fields = split (/\s+/,$line);
69 chomp($subject_aln); 82 chomp($subject_aln);
70 83
71 my $nb_gap_query=0; 84 my $nb_gap_query=0;
72 85
73 if (length($query_aln) == length($subject_aln)){ 86 if (length($query_aln) == length($subject_aln)){
74 if (length($query_aln)<$window_length-$nb_mismatch_max){ 87 if (length($query_aln)<$WINDOW_LENGTH-$NB_MISMATCH_MAX){
75 } 88 }
76 else { 89 else {
77 my @q = split(//,$query_aln); 90 my @q = split(//,$query_aln);
78 my @s = split(//,$subject_aln); 91 my @s = split(//,$subject_aln);
79 for (my $i=0;$i<=$#q;$i++){ 92 for (my $i=0;$i<=$#q;$i++){
80 my $global_idx = $query_start-1+$i-$nb_gap_query; 93 my $global_idx = $query_start-1+$i-$nb_gap_query;
81 if ($q[$i] eq "-"){ 94 if ($q[$i] eq "-"){
82 if ($global_idx < $window_length){ 95 if ($global_idx < $WINDOW_LENGTH){
83 $compt_mismatch_5p++; 96 $compt_mismatch_5p++;
84 } 97 }
85 elsif ($global_idx > $window_length){ 98 elsif ($global_idx > $WINDOW_LENGTH){
86 $compt_mismatch_3p++; 99 $compt_mismatch_3p++;
87 } 100 }
88 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global 101 $nb_gap_query++; #On compte les gap dans la query pour les soustraire de l'index global
89 } 102 }
90 else { 103 else {
91 if ($q[$i] ne $s[$i]){ 104 if ($q[$i] ne $s[$i]){
92 if ($global_idx < $window_length){ 105 if ($global_idx < $WINDOW_LENGTH){
93 $compt_mismatch_5p++; 106 $compt_mismatch_5p++;
94 } 107 }
95 elsif ($global_idx > $window_length){ 108 elsif ($global_idx > $WINDOW_LENGTH){
96 $compt_mismatch_3p++; 109 $compt_mismatch_3p++;
97 } 110 }
98 } 111 }
99 } 112 }
100 } 113 }
101 $compt_mismatch_5p += $query_start-1; 114 $compt_mismatch_5p += $query_start-1;
102 $compt_mismatch_3p += $window_length *2 + 1 - $query_stop; 115 $compt_mismatch_3p += $WINDOW_LENGTH *2 + 1 - $query_stop;
103 116
104 # for (my $i=0;$i<$window_length;$i++){ 117 # for (my $i=0;$i<$window_length;$i++){
105 # if ($tbl_q_aln[$i] eq "#"){ 118 # if ($tbl_q_aln[$i] eq "#"){
106 # $compt_mismatch_5p++; 119 # $compt_mismatch_5p++;
107 # } 120 # }
120 # $compt_mismatch_3p++; 133 # $compt_mismatch_3p++;
121 # } 134 # }
122 # else { 135 # else {
123 # } 136 # }
124 # } 137 # }
125 if (($compt_mismatch_5p <= $nb_mismatch_max)||($compt_mismatch_3p <= $nb_mismatch_max)){ 138 if (($compt_mismatch_5p <= $NB_MISMATCH_MAX)||($compt_mismatch_3p <= $NB_MISMATCH_MAX)){
126 $hash_name{$name}++; 139 $hash_name{$name}++;
127 } 140 }
128 141
129 } 142 }
130 } 143 }
149 close (INB); 162 close (INB);
150 163
151 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); 164 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
152 165
153 while (my $ligne = <INV>) { 166 while (my $ligne = <INV>) {
167 $nb_variant_checked++;
154 168
155 my @champs = split (/\s+/,$ligne); 169 my @champs = split (/\s+/,$ligne);
156 my $header = $champs[0]."_".$champs[1]; 170 my $header = $champs[0]."_".$champs[1];
157 171
158 if ($hash_name{$header}){ 172 if ($hash_name{$header}){
159 if ($hash_name{$header}==1){ 173 if ($hash_name{$header}==1){
160 print $ligne; 174 print $ligne;
175 $nb_variant_selected++;
161 } 176 }
162 } 177 }
163 else { 178 else {
164 #print STDERR "No blast result for ",$header,"\n"; 179 #print STDERR "No blast result for ",$header,"\n";
165 } 180 }
167 182
168 } 183 }
169 184
170 close(INV); 185 close(INV);
171 186
187 open (LF,">$log_file") or die("Can't open $log_file\n");
188 print LF "\n####\t Blast filtering \n";
189 print LF "Variant checked :\t$nb_variant_checked\n";
190 print LF "Variant selected :\t$nb_variant_selected\n";
191 close (LF);
192
172 193
173 # foreach my $key (sort keys %hash_name){ 194 # foreach my $key (sort keys %hash_name){
174 # print $key,"\t",$hash_name{$key},"\n"; 195 # print $key,"\t",$hash_name{$key},"\n";
175 196
176 # } 197 # }