Mercurial > repos > mcharles > rapsodyn
comparison mpileupfilteronblastxml/mpileupfilteronblastxml.pl @ 5:26d2c851d48e draft
Uploaded
author | mcharles |
---|---|
date | Mon, 16 Jun 2014 06:18:07 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:53eff7c08ad3 | 5:26d2c851d48e |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 | |
6 my $input_variant_file; | |
7 my $input_blastxml_file; | |
8 my $window_length = 50; | |
9 my $nb_mismatch_max = 2; | |
10 | |
11 GetOptions ( | |
12 "input_variant_file=s" => \$input_variant_file, | |
13 "input_blastxml_file=s" => \$input_blastxml_file, | |
14 "window_length=i" => \$window_length, | |
15 "nb_mismatch_max=i" => \$nb_mismatch_max | |
16 ) or die("Error in command line arguments\n"); | |
17 | |
18 | |
19 open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n"); | |
20 | |
21 | |
22 my $iteration_stop="</Iteration>"; | |
23 my $hit_stop="</Hit>"; | |
24 my $hsp_stop="</Hsp>"; | |
25 my $query_flag_start="<Iteration_query-def>"; | |
26 my $query_flag_stop="</Iteration_query-def>"; | |
27 my $subject_flag_start="<Hit_def>"; | |
28 my $subject_flag_stop="</Hit_def>"; | |
29 my $query_HSP_flag_from_start="<Hsp_query-from>"; | |
30 my $query_HSP_flag_from_stop="</Hsp_query-from>"; | |
31 my $query_HSP_flag_to_start="<Hsp_query-to>"; | |
32 my $query_HSP_flag_to_stop="</Hsp_query-to>"; | |
33 my $subject_HSP_flag_from_start="<Hsp_hit-from>"; | |
34 my $subject_HSP_flag_from_stop="</Hsp_hit-from>"; | |
35 my $subject_HSP_flag_to_start="<Hsp_hit-to>"; | |
36 my $subject_HSP_flag_to_stop="</Hsp_hit-to>"; | |
37 my $query_HSP_flag_seq_start="<Hsp_qseq>"; | |
38 my $query_HSP_flag_seq_stop="</Hsp_qseq>"; | |
39 my $subject_HSP_flag_seq_start="<Hsp_hseq>"; | |
40 my $subject_HSP_flag_seq_stop="</Hsp_hseq>"; | |
41 my $HSP_midline_flag_start="<Hsp_midline>"; | |
42 my $HSP_midline_flag_stop="</Hsp_midline>"; | |
43 | |
44 my %hash; | |
45 | |
46 my $compt=0; | |
47 my $current_query=""; | |
48 my $current_subject=""; | |
49 my $current_HSP_query_from; | |
50 my $current_HSP_query_to; | |
51 my $current_HSP_subject_from; | |
52 my $current_HSP_subject_to; | |
53 my $current_HSP_midline; | |
54 my $current_HSP_qseq; | |
55 my $current_HSP_hseq; | |
56 my $current_query_short; | |
57 | |
58 while (my $ligne = <INB>) { | |
59 | |
60 if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){ | |
61 $current_subject=$1; | |
62 #print "--",$1,"\n"; | |
63 } | |
64 | |
65 if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){ | |
66 $current_query=$1; | |
67 $current_query_short=$current_query; | |
68 if ($current_query =~ /^(.*?_\d+)\_/){ | |
69 $current_query_short=$1; | |
70 } | |
71 if (!$hash{$current_query_short}){ | |
72 $hash{$current_query_short}=0; | |
73 } | |
74 | |
75 | |
76 #print "--",$1,"\n"; | |
77 } | |
78 if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){ | |
79 $current_HSP_query_from=$1; | |
80 #print "--",$1,"..."; | |
81 } | |
82 if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){ | |
83 $current_HSP_query_to=$1; | |
84 #print $1,"\n"; | |
85 } | |
86 if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){ | |
87 $current_HSP_subject_from=$1; | |
88 #print "--",$1,"..."; | |
89 } | |
90 if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){ | |
91 $current_HSP_subject_to=$1; | |
92 #print $1,"\n"; | |
93 } | |
94 if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){ | |
95 $current_HSP_qseq=$1; | |
96 #print "--",$1,"\n"; | |
97 } | |
98 if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){ | |
99 $current_HSP_hseq=$1; | |
100 #print "--",$1,"\n"; | |
101 } | |
102 if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){ | |
103 $current_HSP_midline=$1; | |
104 #print "--",$1,"\n"; | |
105 } | |
106 | |
107 if ($ligne=~/$hsp_stop/){ | |
108 if ($current_HSP_query_from){ | |
109 #print "\ntest1\n"; | |
110 #print "Query : $current_query\n"; | |
111 #print "Subject : $current_subject\n"; | |
112 #print "$current_HSP_query_from ... $current_HSP_query_to\n"; | |
113 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; | |
114 for (my $i=1;$i<$current_HSP_query_from;$i++){ | |
115 $current_HSP_qseq = "N".$current_HSP_qseq; | |
116 $current_HSP_midline = " ".$current_HSP_midline; | |
117 $current_HSP_hseq = "N".$current_HSP_hseq; | |
118 } | |
119 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ | |
120 $current_HSP_qseq .= "N"; | |
121 $current_HSP_midline .= " "; | |
122 $current_HSP_hseq .= "N"; | |
123 } | |
124 | |
125 my @qseq = split(//,$current_HSP_qseq); | |
126 my @midline = split(//,$current_HSP_midline); | |
127 my @hseq = split(//,$current_HSP_hseq); | |
128 | |
129 my $comptbase=0; | |
130 my $compt5p=0; | |
131 my $compt3p=0; | |
132 for (my $i=0;$i<=$#qseq;$i++){ | |
133 if ($qseq[$i] ne "-"){ | |
134 $comptbase++; # Va de 1 -> $window_length *2 +1 | |
135 } | |
136 if ($midline[$i] eq " "){ | |
137 if ($comptbase<=$window_length){ #1 -> $window_length | |
138 $compt5p++; | |
139 } | |
140 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; | |
141 $compt3p++; | |
142 } | |
143 else { #+1-$window_length*2+1 | |
144 | |
145 } | |
146 } | |
147 } | |
148 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ | |
149 $hash{$current_query_short}++; | |
150 } | |
151 | |
152 #print "$current_HSP_qseq\n"; | |
153 #print "$current_HSP_midline\n"; | |
154 #print "$current_HSP_hseq\n"; | |
155 #print "$compt5p // $compt3p\n"; | |
156 #print $hash{$current_query_short},"\n"; | |
157 | |
158 } | |
159 | |
160 undef $current_HSP_query_from; | |
161 undef $current_HSP_query_to; | |
162 undef $current_HSP_subject_from; | |
163 undef $current_HSP_subject_to; | |
164 $current_HSP_midline=""; | |
165 $current_HSP_qseq=""; | |
166 $current_HSP_hseq=""; | |
167 } | |
168 | |
169 if ($ligne=~/$iteration_stop/){ | |
170 if ($current_HSP_query_from){ | |
171 #print "\ntest2\n"; | |
172 #print "Query : $current_query\n"; | |
173 #print "Subject : $current_subject\n"; | |
174 #print "$current_HSP_query_from ... $current_HSP_query_to\n"; | |
175 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n"; | |
176 for (my $i=1;$i<$current_HSP_query_from;$i++){ | |
177 $current_HSP_qseq = "N".$current_HSP_qseq; | |
178 $current_HSP_midline = " ".$current_HSP_midline; | |
179 $current_HSP_hseq = "N".$current_HSP_hseq; | |
180 } | |
181 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){ | |
182 $current_HSP_qseq .= "N"; | |
183 $current_HSP_midline .= " "; | |
184 $current_HSP_hseq .= "N"; | |
185 } | |
186 | |
187 my @qseq = split(//,$current_HSP_qseq); | |
188 my @midline = split(//,$current_HSP_midline); | |
189 my @hseq = split(//,$current_HSP_hseq); | |
190 | |
191 my $comptbase=0; | |
192 my $compt5p=0; | |
193 my $compt3p=0; | |
194 for (my $i=0;$i<=$#qseq;$i++){ | |
195 if ($qseq[$i] ne "-"){ | |
196 $comptbase++; # Va de 1 -> $window_length *2 +1 | |
197 } | |
198 if ($midline[$i] eq " "){ | |
199 if ($comptbase<=$window_length){ #1 -> $window_length | |
200 $compt5p++; | |
201 } | |
202 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1; | |
203 $compt3p++; | |
204 } | |
205 else { #+1-$window_length*2+1 | |
206 | |
207 } | |
208 } | |
209 } | |
210 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){ | |
211 $hash{$current_query_short}++; | |
212 } | |
213 | |
214 #print "$current_HSP_qseq\n"; | |
215 #print "$current_HSP_midline\n"; | |
216 #print "$current_HSP_hseq\n"; | |
217 #print "$compt5p // $compt3p\n"; | |
218 #print $hash{$current_query_short},"\n"; | |
219 } | |
220 $current_query=""; | |
221 $current_query_short=""; | |
222 $current_subject=""; | |
223 undef $current_HSP_query_from; | |
224 undef $current_HSP_query_to; | |
225 undef $current_HSP_subject_from; | |
226 undef $current_HSP_subject_to; | |
227 $current_HSP_midline=""; | |
228 $current_HSP_qseq=""; | |
229 $current_HSP_hseq=""; | |
230 } | |
231 | |
232 } | |
233 | |
234 close (INB); | |
235 | |
236 # foreach my $key (sort trinombre keys %hash){ | |
237 # print $key," ",$hash{$key},"\n"; | |
238 | |
239 # } | |
240 # exit(0); | |
241 | |
242 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n"); | |
243 | |
244 while (my $ligne = <INV>) { | |
245 my @champs = split (/\s+/,$ligne); | |
246 my $header = $champs[0]."_".$champs[1]; | |
247 | |
248 if ($hash{$header}){ | |
249 if ($hash{$header}==1){ | |
250 print "$ligne"; | |
251 } | |
252 else { | |
253 #print $hash{$header}," $ligne"; | |
254 } | |
255 } | |
256 else { | |
257 print STDERR "No blast result for ",$header,"\n"; | |
258 } | |
259 | |
260 | |
261 } | |
262 | |
263 close(INV); | |
264 | |
265 | |
266 | |
267 sub trinombre { | |
268 my $chra=$a; | |
269 my $posa=0; | |
270 my $chrb=$b; | |
271 my $posb=0; | |
272 | |
273 if ($a =~/(.*?)\_(\d+)/){ | |
274 $chra=$1; | |
275 $posa=$2; | |
276 } | |
277 if ($b =~/(.*?)\_(\d+)/){ | |
278 $chrb=$1; | |
279 $posb=$2; | |
280 } | |
281 | |
282 | |
283 $chra cmp $chrb | |
284 || | |
285 $posa <=> $posb; | |
286 } | |
287 |