comparison mpileupfilteronblastxml/mpileupfilteronblastxml.pl @ 5:26d2c851d48e draft

Uploaded
author mcharles
date Mon, 16 Jun 2014 06:18:07 -0400
parents
children
comparison
equal deleted inserted replaced
4:53eff7c08ad3 5:26d2c851d48e
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5
6 my $input_variant_file;
7 my $input_blastxml_file;
8 my $window_length = 50;
9 my $nb_mismatch_max = 2;
10
11 GetOptions (
12 "input_variant_file=s" => \$input_variant_file,
13 "input_blastxml_file=s" => \$input_blastxml_file,
14 "window_length=i" => \$window_length,
15 "nb_mismatch_max=i" => \$nb_mismatch_max
16 ) or die("Error in command line arguments\n");
17
18
19 open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n");
20
21
22 my $iteration_stop="</Iteration>";
23 my $hit_stop="</Hit>";
24 my $hsp_stop="</Hsp>";
25 my $query_flag_start="<Iteration_query-def>";
26 my $query_flag_stop="</Iteration_query-def>";
27 my $subject_flag_start="<Hit_def>";
28 my $subject_flag_stop="</Hit_def>";
29 my $query_HSP_flag_from_start="<Hsp_query-from>";
30 my $query_HSP_flag_from_stop="</Hsp_query-from>";
31 my $query_HSP_flag_to_start="<Hsp_query-to>";
32 my $query_HSP_flag_to_stop="</Hsp_query-to>";
33 my $subject_HSP_flag_from_start="<Hsp_hit-from>";
34 my $subject_HSP_flag_from_stop="</Hsp_hit-from>";
35 my $subject_HSP_flag_to_start="<Hsp_hit-to>";
36 my $subject_HSP_flag_to_stop="</Hsp_hit-to>";
37 my $query_HSP_flag_seq_start="<Hsp_qseq>";
38 my $query_HSP_flag_seq_stop="</Hsp_qseq>";
39 my $subject_HSP_flag_seq_start="<Hsp_hseq>";
40 my $subject_HSP_flag_seq_stop="</Hsp_hseq>";
41 my $HSP_midline_flag_start="<Hsp_midline>";
42 my $HSP_midline_flag_stop="</Hsp_midline>";
43
44 my %hash;
45
46 my $compt=0;
47 my $current_query="";
48 my $current_subject="";
49 my $current_HSP_query_from;
50 my $current_HSP_query_to;
51 my $current_HSP_subject_from;
52 my $current_HSP_subject_to;
53 my $current_HSP_midline;
54 my $current_HSP_qseq;
55 my $current_HSP_hseq;
56 my $current_query_short;
57
58 while (my $ligne = <INB>) {
59
60 if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){
61 $current_subject=$1;
62 #print "--",$1,"\n";
63 }
64
65 if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){
66 $current_query=$1;
67 $current_query_short=$current_query;
68 if ($current_query =~ /^(.*?_\d+)\_/){
69 $current_query_short=$1;
70 }
71 if (!$hash{$current_query_short}){
72 $hash{$current_query_short}=0;
73 }
74
75
76 #print "--",$1,"\n";
77 }
78 if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){
79 $current_HSP_query_from=$1;
80 #print "--",$1,"...";
81 }
82 if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){
83 $current_HSP_query_to=$1;
84 #print $1,"\n";
85 }
86 if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){
87 $current_HSP_subject_from=$1;
88 #print "--",$1,"...";
89 }
90 if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){
91 $current_HSP_subject_to=$1;
92 #print $1,"\n";
93 }
94 if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){
95 $current_HSP_qseq=$1;
96 #print "--",$1,"\n";
97 }
98 if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){
99 $current_HSP_hseq=$1;
100 #print "--",$1,"\n";
101 }
102 if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){
103 $current_HSP_midline=$1;
104 #print "--",$1,"\n";
105 }
106
107 if ($ligne=~/$hsp_stop/){
108 if ($current_HSP_query_from){
109 #print "\ntest1\n";
110 #print "Query : $current_query\n";
111 #print "Subject : $current_subject\n";
112 #print "$current_HSP_query_from ... $current_HSP_query_to\n";
113 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
114 for (my $i=1;$i<$current_HSP_query_from;$i++){
115 $current_HSP_qseq = "N".$current_HSP_qseq;
116 $current_HSP_midline = " ".$current_HSP_midline;
117 $current_HSP_hseq = "N".$current_HSP_hseq;
118 }
119 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
120 $current_HSP_qseq .= "N";
121 $current_HSP_midline .= " ";
122 $current_HSP_hseq .= "N";
123 }
124
125 my @qseq = split(//,$current_HSP_qseq);
126 my @midline = split(//,$current_HSP_midline);
127 my @hseq = split(//,$current_HSP_hseq);
128
129 my $comptbase=0;
130 my $compt5p=0;
131 my $compt3p=0;
132 for (my $i=0;$i<=$#qseq;$i++){
133 if ($qseq[$i] ne "-"){
134 $comptbase++; # Va de 1 -> $window_length *2 +1
135 }
136 if ($midline[$i] eq " "){
137 if ($comptbase<=$window_length){ #1 -> $window_length
138 $compt5p++;
139 }
140 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
141 $compt3p++;
142 }
143 else { #+1-$window_length*2+1
144
145 }
146 }
147 }
148 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
149 $hash{$current_query_short}++;
150 }
151
152 #print "$current_HSP_qseq\n";
153 #print "$current_HSP_midline\n";
154 #print "$current_HSP_hseq\n";
155 #print "$compt5p // $compt3p\n";
156 #print $hash{$current_query_short},"\n";
157
158 }
159
160 undef $current_HSP_query_from;
161 undef $current_HSP_query_to;
162 undef $current_HSP_subject_from;
163 undef $current_HSP_subject_to;
164 $current_HSP_midline="";
165 $current_HSP_qseq="";
166 $current_HSP_hseq="";
167 }
168
169 if ($ligne=~/$iteration_stop/){
170 if ($current_HSP_query_from){
171 #print "\ntest2\n";
172 #print "Query : $current_query\n";
173 #print "Subject : $current_subject\n";
174 #print "$current_HSP_query_from ... $current_HSP_query_to\n";
175 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
176 for (my $i=1;$i<$current_HSP_query_from;$i++){
177 $current_HSP_qseq = "N".$current_HSP_qseq;
178 $current_HSP_midline = " ".$current_HSP_midline;
179 $current_HSP_hseq = "N".$current_HSP_hseq;
180 }
181 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
182 $current_HSP_qseq .= "N";
183 $current_HSP_midline .= " ";
184 $current_HSP_hseq .= "N";
185 }
186
187 my @qseq = split(//,$current_HSP_qseq);
188 my @midline = split(//,$current_HSP_midline);
189 my @hseq = split(//,$current_HSP_hseq);
190
191 my $comptbase=0;
192 my $compt5p=0;
193 my $compt3p=0;
194 for (my $i=0;$i<=$#qseq;$i++){
195 if ($qseq[$i] ne "-"){
196 $comptbase++; # Va de 1 -> $window_length *2 +1
197 }
198 if ($midline[$i] eq " "){
199 if ($comptbase<=$window_length){ #1 -> $window_length
200 $compt5p++;
201 }
202 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
203 $compt3p++;
204 }
205 else { #+1-$window_length*2+1
206
207 }
208 }
209 }
210 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
211 $hash{$current_query_short}++;
212 }
213
214 #print "$current_HSP_qseq\n";
215 #print "$current_HSP_midline\n";
216 #print "$current_HSP_hseq\n";
217 #print "$compt5p // $compt3p\n";
218 #print $hash{$current_query_short},"\n";
219 }
220 $current_query="";
221 $current_query_short="";
222 $current_subject="";
223 undef $current_HSP_query_from;
224 undef $current_HSP_query_to;
225 undef $current_HSP_subject_from;
226 undef $current_HSP_subject_to;
227 $current_HSP_midline="";
228 $current_HSP_qseq="";
229 $current_HSP_hseq="";
230 }
231
232 }
233
234 close (INB);
235
236 # foreach my $key (sort trinombre keys %hash){
237 # print $key," ",$hash{$key},"\n";
238
239 # }
240 # exit(0);
241
242 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
243
244 while (my $ligne = <INV>) {
245 my @champs = split (/\s+/,$ligne);
246 my $header = $champs[0]."_".$champs[1];
247
248 if ($hash{$header}){
249 if ($hash{$header}==1){
250 print "$ligne";
251 }
252 else {
253 #print $hash{$header}," $ligne";
254 }
255 }
256 else {
257 print STDERR "No blast result for ",$header,"\n";
258 }
259
260
261 }
262
263 close(INV);
264
265
266
267 sub trinombre {
268 my $chra=$a;
269 my $posa=0;
270 my $chrb=$b;
271 my $posb=0;
272
273 if ($a =~/(.*?)\_(\d+)/){
274 $chra=$1;
275 $posa=$2;
276 }
277 if ($b =~/(.*?)\_(\d+)/){
278 $chrb=$1;
279 $posb=$2;
280 }
281
282
283 $chra cmp $chrb
284 ||
285 $posa <=> $posb;
286 }
287