9
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5
|
|
6 my $input_variant_file;
|
|
7 my $input_blastxml_file;
|
|
8 my $window_length = 50;
|
|
9 my $nb_mismatch_max = 2;
|
|
10
|
|
11 GetOptions (
|
|
12 "input_variant_file=s" => \$input_variant_file,
|
|
13 "input_blastxml_file=s" => \$input_blastxml_file,
|
|
14 "window_length=i" => \$window_length,
|
|
15 "nb_mismatch_max=i" => \$nb_mismatch_max
|
|
16 ) or die("Error in command line arguments\n");
|
|
17
|
|
18
|
|
19 open(INB, $input_blastxml_file) or die ("Can't open $input_blastxml_file\n");
|
|
20
|
|
21
|
|
22 my $iteration_stop="</Iteration>";
|
|
23 my $hit_stop="</Hit>";
|
|
24 my $hsp_stop="</Hsp>";
|
|
25 my $query_flag_start="<Iteration_query-def>";
|
|
26 my $query_flag_stop="</Iteration_query-def>";
|
|
27 my $subject_flag_start="<Hit_def>";
|
|
28 my $subject_flag_stop="</Hit_def>";
|
|
29 my $query_HSP_flag_from_start="<Hsp_query-from>";
|
|
30 my $query_HSP_flag_from_stop="</Hsp_query-from>";
|
|
31 my $query_HSP_flag_to_start="<Hsp_query-to>";
|
|
32 my $query_HSP_flag_to_stop="</Hsp_query-to>";
|
|
33 my $subject_HSP_flag_from_start="<Hsp_hit-from>";
|
|
34 my $subject_HSP_flag_from_stop="</Hsp_hit-from>";
|
|
35 my $subject_HSP_flag_to_start="<Hsp_hit-to>";
|
|
36 my $subject_HSP_flag_to_stop="</Hsp_hit-to>";
|
|
37 my $query_HSP_flag_seq_start="<Hsp_qseq>";
|
|
38 my $query_HSP_flag_seq_stop="</Hsp_qseq>";
|
|
39 my $subject_HSP_flag_seq_start="<Hsp_hseq>";
|
|
40 my $subject_HSP_flag_seq_stop="</Hsp_hseq>";
|
|
41 my $HSP_midline_flag_start="<Hsp_midline>";
|
|
42 my $HSP_midline_flag_stop="</Hsp_midline>";
|
|
43
|
|
44 my %hash;
|
|
45
|
|
46 my $compt=0;
|
|
47 my $current_query="";
|
|
48 my $current_subject="";
|
|
49 my $current_HSP_query_from;
|
|
50 my $current_HSP_query_to;
|
|
51 my $current_HSP_subject_from;
|
|
52 my $current_HSP_subject_to;
|
|
53 my $current_HSP_midline;
|
|
54 my $current_HSP_qseq;
|
|
55 my $current_HSP_hseq;
|
|
56 my $current_query_short;
|
|
57
|
|
58 while (my $ligne = <INB>) {
|
|
59
|
|
60 if ($ligne=~/$subject_flag_start(.*?)$subject_flag_stop/){
|
|
61 $current_subject=$1;
|
|
62 #print "--",$1,"\n";
|
|
63 }
|
|
64
|
|
65 if ($ligne=~/$query_flag_start(.*?)$query_flag_stop/){
|
|
66 $current_query=$1;
|
|
67 $current_query_short=$current_query;
|
|
68 if ($current_query =~ /^(.*?_\d+)\_/){
|
|
69 $current_query_short=$1;
|
|
70 }
|
|
71 if (!$hash{$current_query_short}){
|
|
72 $hash{$current_query_short}=0;
|
|
73 }
|
|
74
|
|
75
|
|
76 #print "--",$1,"\n";
|
|
77 }
|
|
78 if ($ligne=~/$query_HSP_flag_from_start(.*?)$query_HSP_flag_from_stop/){
|
|
79 $current_HSP_query_from=$1;
|
|
80 #print "--",$1,"...";
|
|
81 }
|
|
82 if ($ligne=~/$query_HSP_flag_to_start(.*?)$query_HSP_flag_to_stop/){
|
|
83 $current_HSP_query_to=$1;
|
|
84 #print $1,"\n";
|
|
85 }
|
|
86 if ($ligne=~/$subject_HSP_flag_from_start(.*?)$subject_HSP_flag_from_stop/){
|
|
87 $current_HSP_subject_from=$1;
|
|
88 #print "--",$1,"...";
|
|
89 }
|
|
90 if ($ligne=~/$subject_HSP_flag_to_start(.*?)$subject_HSP_flag_to_stop/){
|
|
91 $current_HSP_subject_to=$1;
|
|
92 #print $1,"\n";
|
|
93 }
|
|
94 if ($ligne=~/$query_HSP_flag_seq_start(.*?)$query_HSP_flag_seq_stop/){
|
|
95 $current_HSP_qseq=$1;
|
|
96 #print "--",$1,"\n";
|
|
97 }
|
|
98 if ($ligne=~/$subject_HSP_flag_seq_start(.*?)$subject_HSP_flag_seq_stop/){
|
|
99 $current_HSP_hseq=$1;
|
|
100 #print "--",$1,"\n";
|
|
101 }
|
|
102 if ($ligne=~/$HSP_midline_flag_start(.*?)$HSP_midline_flag_stop/){
|
|
103 $current_HSP_midline=$1;
|
|
104 #print "--",$1,"\n";
|
|
105 }
|
|
106
|
|
107 if ($ligne=~/$hsp_stop/){
|
|
108 if ($current_HSP_query_from){
|
|
109 #print "\ntest1\n";
|
|
110 #print "Query : $current_query\n";
|
|
111 #print "Subject : $current_subject\n";
|
|
112 #print "$current_HSP_query_from ... $current_HSP_query_to\n";
|
|
113 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
|
|
114 for (my $i=1;$i<$current_HSP_query_from;$i++){
|
|
115 $current_HSP_qseq = "N".$current_HSP_qseq;
|
|
116 $current_HSP_midline = " ".$current_HSP_midline;
|
|
117 $current_HSP_hseq = "N".$current_HSP_hseq;
|
|
118 }
|
|
119 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
|
|
120 $current_HSP_qseq .= "N";
|
|
121 $current_HSP_midline .= " ";
|
|
122 $current_HSP_hseq .= "N";
|
|
123 }
|
|
124
|
|
125 my @qseq = split(//,$current_HSP_qseq);
|
|
126 my @midline = split(//,$current_HSP_midline);
|
|
127 my @hseq = split(//,$current_HSP_hseq);
|
|
128
|
|
129 my $comptbase=0;
|
|
130 my $compt5p=0;
|
|
131 my $compt3p=0;
|
|
132 for (my $i=0;$i<=$#qseq;$i++){
|
|
133 if ($qseq[$i] ne "-"){
|
|
134 $comptbase++; # Va de 1 -> $window_length *2 +1
|
|
135 }
|
|
136 if ($midline[$i] eq " "){
|
|
137 if ($comptbase<=$window_length){ #1 -> $window_length
|
|
138 $compt5p++;
|
|
139 }
|
|
140 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
|
|
141 $compt3p++;
|
|
142 }
|
|
143 else { #+1-$window_length*2+1
|
|
144
|
|
145 }
|
|
146 }
|
|
147 }
|
|
148 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
|
|
149 $hash{$current_query_short}++;
|
|
150 }
|
|
151
|
|
152 #print "$current_HSP_qseq\n";
|
|
153 #print "$current_HSP_midline\n";
|
|
154 #print "$current_HSP_hseq\n";
|
|
155 #print "$compt5p // $compt3p\n";
|
|
156 #print $hash{$current_query_short},"\n";
|
|
157
|
|
158 }
|
|
159
|
|
160 undef $current_HSP_query_from;
|
|
161 undef $current_HSP_query_to;
|
|
162 undef $current_HSP_subject_from;
|
|
163 undef $current_HSP_subject_to;
|
|
164 $current_HSP_midline="";
|
|
165 $current_HSP_qseq="";
|
|
166 $current_HSP_hseq="";
|
|
167 }
|
|
168
|
|
169 if ($ligne=~/$iteration_stop/){
|
|
170 if ($current_HSP_query_from){
|
|
171 #print "\ntest2\n";
|
|
172 #print "Query : $current_query\n";
|
|
173 #print "Subject : $current_subject\n";
|
|
174 #print "$current_HSP_query_from ... $current_HSP_query_to\n";
|
|
175 #print "$current_HSP_subject_from ... $current_HSP_subject_to\n";
|
|
176 for (my $i=1;$i<$current_HSP_query_from;$i++){
|
|
177 $current_HSP_qseq = "N".$current_HSP_qseq;
|
|
178 $current_HSP_midline = " ".$current_HSP_midline;
|
|
179 $current_HSP_hseq = "N".$current_HSP_hseq;
|
|
180 }
|
|
181 for (my $i=$current_HSP_query_to+1;$i<=$window_length*2+1;$i++){
|
|
182 $current_HSP_qseq .= "N";
|
|
183 $current_HSP_midline .= " ";
|
|
184 $current_HSP_hseq .= "N";
|
|
185 }
|
|
186
|
|
187 my @qseq = split(//,$current_HSP_qseq);
|
|
188 my @midline = split(//,$current_HSP_midline);
|
|
189 my @hseq = split(//,$current_HSP_hseq);
|
|
190
|
|
191 my $comptbase=0;
|
|
192 my $compt5p=0;
|
|
193 my $compt3p=0;
|
|
194 for (my $i=0;$i<=$#qseq;$i++){
|
|
195 if ($qseq[$i] ne "-"){
|
|
196 $comptbase++; # Va de 1 -> $window_length *2 +1
|
|
197 }
|
|
198 if ($midline[$i] eq " "){
|
|
199 if ($comptbase<=$window_length){ #1 -> $window_length
|
|
200 $compt5p++;
|
|
201 }
|
|
202 elsif ($comptbase>=$window_length+2){ #$window_length+2 -> $window_length *2 + 1;
|
|
203 $compt3p++;
|
|
204 }
|
|
205 else { #+1-$window_length*2+1
|
|
206
|
|
207 }
|
|
208 }
|
|
209 }
|
|
210 if (($compt3p<=$nb_mismatch_max)||($compt5p<=$nb_mismatch_max)){
|
|
211 $hash{$current_query_short}++;
|
|
212 }
|
|
213
|
|
214 #print "$current_HSP_qseq\n";
|
|
215 #print "$current_HSP_midline\n";
|
|
216 #print "$current_HSP_hseq\n";
|
|
217 #print "$compt5p // $compt3p\n";
|
|
218 #print $hash{$current_query_short},"\n";
|
|
219 }
|
|
220 $current_query="";
|
|
221 $current_query_short="";
|
|
222 $current_subject="";
|
|
223 undef $current_HSP_query_from;
|
|
224 undef $current_HSP_query_to;
|
|
225 undef $current_HSP_subject_from;
|
|
226 undef $current_HSP_subject_to;
|
|
227 $current_HSP_midline="";
|
|
228 $current_HSP_qseq="";
|
|
229 $current_HSP_hseq="";
|
|
230 }
|
|
231
|
|
232 }
|
|
233
|
|
234 close (INB);
|
|
235
|
|
236 # foreach my $key (sort trinombre keys %hash){
|
|
237 # print $key," ",$hash{$key},"\n";
|
|
238
|
|
239 # }
|
|
240 # exit(0);
|
|
241
|
|
242 open(INV, $input_variant_file) or die ("Can't open $input_variant_file\n");
|
|
243
|
|
244 while (my $ligne = <INV>) {
|
|
245 my @champs = split (/\s+/,$ligne);
|
|
246 my $header = $champs[0]."_".$champs[1];
|
|
247
|
|
248 if ($hash{$header}){
|
|
249 if ($hash{$header}==1){
|
|
250 print "$ligne";
|
|
251 }
|
|
252 else {
|
|
253 #print $hash{$header}," $ligne";
|
|
254 }
|
|
255 }
|
|
256 else {
|
|
257 print STDERR "No blast result for ",$header,"\n";
|
|
258 }
|
|
259
|
|
260
|
|
261 }
|
|
262
|
|
263 close(INV);
|
|
264
|
|
265
|
|
266
|
|
267 sub trinombre {
|
|
268 my $chra=$a;
|
|
269 my $posa=0;
|
|
270 my $chrb=$b;
|
|
271 my $posb=0;
|
|
272
|
|
273 if ($a =~/(.*?)\_(\d+)/){
|
|
274 $chra=$1;
|
|
275 $posa=$2;
|
|
276 }
|
|
277 if ($b =~/(.*?)\_(\d+)/){
|
|
278 $chrb=$1;
|
|
279 $posb=$2;
|
|
280 }
|
|
281
|
|
282
|
|
283 $chra cmp $chrb
|
|
284 ||
|
|
285 $posa <=> $posb;
|
|
286 }
|
|
287
|