Mercurial > repos > mcharles > rapsodyn
comparison mpileupfilter/mpileupfilter.pl @ 5:26d2c851d48e draft
Uploaded
author | mcharles |
---|---|
date | Mon, 16 Jun 2014 06:18:07 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:53eff7c08ad3 | 5:26d2c851d48e |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use Getopt::Long; | |
4 | |
5 # | |
6 # Filter a pileup file on forward/reverse presence and %read having the variant | |
7 # The error code | |
8 # 1 : multiple variant type detected insertion/deletion/mutation | |
9 # 1i : inconsistency in insertion | |
10 # 1d : inconsistency in deletion | |
11 # 1m : inconsistency in mutation | |
12 # 2 : insufficient depth | |
13 # 3 : insufficient variant frequency | |
14 # 4 : variant position not covered by forward and reverse reads | |
15 # 5 : variant with other variant in neighbourhood | |
16 # 6 : too much depth | |
17 # 8 : parsing error (couldn't parse the mpileup line correctly) | |
18 # 9 : parsing error (couldn't parse the readbase string correctly) | |
19 | |
20 | |
21 my $inputfile; | |
22 my $logfile; | |
23 my $MIN_DISTANCE=0; | |
24 my $MIN_VARIANTFREQUENCY=0; | |
25 my $MIN_FORWARDREVERSE=0; | |
26 my $MIN_DEPTH=0; | |
27 my $MAX_DEPTH=500; | |
28 my $VERBOSE=0; | |
29 my $ONLY_UNFILTERED_VARIANT="OFF"; | |
30 | |
31 if ($#ARGV<0){ | |
32 print "\n"; | |
33 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n"; | |
34 print "-input_file \tinputfile in mpileup format\n"; | |
35 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n"; | |
36 print "-min_depth \tminimum depth required [1]\n"; | |
37 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n"; | |
38 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n"; | |
39 print "-min_distance \tminimum distance between variant [0]\n"; | |
40 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n"; | |
41 print "\n"; | |
42 exit(0); | |
43 } | |
44 | |
45 GetOptions ( | |
46 "input_file=s" => \$inputfile, | |
47 "log_file=s" => \$logfile, | |
48 "min_depth=i" => \$MIN_DEPTH, | |
49 "max_depth=i" => \$MAX_DEPTH, | |
50 "min_frequency=f" => \$MIN_VARIANTFREQUENCY, | |
51 "min_distance=i" => \$MIN_DISTANCE, | |
52 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, | |
53 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT, | |
54 "v=i" => \$VERBOSE | |
55 ) or die("Error in command line arguments\n"); | |
56 | |
57 | |
58 open(IF, $inputfile) or die("Can't open $inputfile\n"); | |
59 | |
60 my @tbl_line; | |
61 my @tbl_variant_position; | |
62 my @tbl_variant_chr; | |
63 my @tbl_variant_refbase; | |
64 my @tbl_variant_coverage; | |
65 my @tbl_variant_readbase_string; | |
66 my @tbl_variant_quality_string; | |
67 | |
68 #Extraction des variants | |
69 my $nb_line=0; | |
70 while (my $line=<IF>){ | |
71 $nb_line++; | |
72 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ | |
73 print "$nb_line\n"; | |
74 } | |
75 my $error_code=0; | |
76 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ | |
77 my $current_chromosome = $1; | |
78 my $current_position = $2; | |
79 my $current_refbase = $3; | |
80 my $current_coverage = $4; | |
81 my $current_readbase_string = $5; | |
82 my $current_quality_string = $6; | |
83 | |
84 #Suppression of mPileUp special character | |
85 $current_readbase_string =~ s/\$//g; #the read start at this position | |
86 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char | |
87 | |
88 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ | |
89 push(@tbl_line,$line); | |
90 push(@tbl_variant_chr,$current_chromosome); | |
91 push(@tbl_variant_position,$current_position); | |
92 push(@tbl_variant_refbase,$current_refbase); | |
93 push(@tbl_variant_coverage,$current_coverage); | |
94 push(@tbl_variant_readbase_string,$current_readbase_string); | |
95 push(@tbl_variant_quality_string,$current_quality_string); | |
96 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
97 print $line; | |
98 } | |
99 | |
100 } | |
101 else { | |
102 #Position with no variant | |
103 } | |
104 | |
105 } | |
106 else { | |
107 #Error Parsing | |
108 print STDERR "$line #8"; | |
109 } | |
110 } | |
111 close(IF); | |
112 | |
113 if ($ONLY_UNFILTERED_VARIANT eq "ON"){ | |
114 exit(0); | |
115 } | |
116 | |
117 ####Checking the distance between variant and other filter | |
118 | |
119 if ($logfile){ | |
120 open(LF,">$logfile") or die ("Cant't open $logfile\n"); | |
121 } | |
122 | |
123 for (my $i=0;$i<=$#tbl_line;$i++){ | |
124 # print "ligne : $tbl_line[$i]\n"; | |
125 | |
126 my $error_code=0; | |
127 if ($i==0){ | |
128 #Comparing $i and $i+1 for neighbourhood filter; | |
129 if ($#tbl_line>0){ | |
130 if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){ | |
131 $error_code=5; | |
132 chomp($tbl_line[$i]); | |
133 if ($logfile){ | |
134 print LF "$tbl_line[$i]\tcode:$error_code\n"; | |
135 } | |
136 next; | |
137 } | |
138 } | |
139 | |
140 #Additionnal filters | |
141 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); | |
142 | |
143 } | |
144 else { | |
145 #Compairing $i and $i-1 for neighbourhood filter | |
146 if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){ | |
147 $error_code=5; | |
148 chomp($tbl_line[$i]); | |
149 if ($logfile){ | |
150 print LF "$tbl_line[$i]\tcode:$error_code\n"; | |
151 } | |
152 next; | |
153 } | |
154 else { | |
155 #Additionnal filters | |
156 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); | |
157 } | |
158 } | |
159 if ($error_code == 0){ | |
160 print $tbl_line[$i]; | |
161 } | |
162 else { | |
163 chomp($tbl_line[$i]); | |
164 if ($logfile){ | |
165 print LF "$tbl_line[$i]\tcode:$error_code\n"; | |
166 } | |
167 } | |
168 } | |
169 | |
170 if ($logfile){ | |
171 close (LF); | |
172 } | |
173 | |
174 sub check_error{ | |
175 my $current_chromosome = shift; | |
176 my $current_position = shift; | |
177 my $current_refbase = shift; | |
178 my $current_coverage = shift; | |
179 my $current_readbase_string = shift; | |
180 | |
181 # print "test : $current_readbase_string\n"; | |
182 | |
183 | |
184 | |
185 #Extraction of insertions | |
186 | |
187 ################################################################## | |
188 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; | |
189 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; | |
190 # print "IN : @IN\n"; | |
191 # print "DEL :@DEL\n"; | |
192 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; | |
193 ################################################################## | |
194 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct | |
195 ################################################################## | |
196 | |
197 # => parcours de boucle | |
198 my @readbase = split(//,$current_readbase_string); | |
199 my $cleaned_readbase_string=""; | |
200 my @IN; | |
201 my @DEL; | |
202 my $current_IN=""; | |
203 my $current_DEL=""; | |
204 my $current_size=0; | |
205 | |
206 for (my $i=0;$i<=$#readbase;$i++){ | |
207 if ($readbase[$i] eq "+"){ | |
208 #Ouverture de IN | |
209 $current_IN="+"; | |
210 | |
211 #Recuperation de la taille | |
212 my $sub = substr $current_readbase_string,$i; | |
213 if ($sub=~/^\+(\d+)/){ | |
214 $current_size = $1; | |
215 } | |
216 my $remaining_size = $current_size; | |
217 while (($remaining_size>0)&&($i<=$#readbase)){ | |
218 $i++; | |
219 $current_IN.=$readbase[$i]; | |
220 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
221 $remaining_size--; | |
222 } | |
223 } | |
224 push(@IN,$current_IN); | |
225 } | |
226 elsif ($readbase[$i] eq "-"){ | |
227 #Ouverture de DEL | |
228 $current_DEL="-"; | |
229 | |
230 #Recuperation de la taille | |
231 my $sub = substr $current_readbase_string,$i; | |
232 if ($sub=~/^\-(\d+)/){ | |
233 $current_size = $1; | |
234 } | |
235 my $remaining_size = $current_size; | |
236 while (($remaining_size>0)&&($i<=$#readbase)){ | |
237 $i++; | |
238 $current_DEL.=$readbase[$i]; | |
239 if ($readbase[$i]=~ /[ATGCNatgcn]/){ | |
240 $remaining_size--; | |
241 } | |
242 } | |
243 push(@DEL,$current_DEL); | |
244 | |
245 } | |
246 else { | |
247 #Ajout a la string | |
248 $cleaned_readbase_string .= $readbase[$i]; | |
249 } | |
250 } | |
251 | |
252 | |
253 # print "IN : @IN\n"; | |
254 # print "DEL :@DEL\n"; | |
255 # print "$cleaned_readbase_string\n"; | |
256 | |
257 my @current_readbase_array = split(//,$cleaned_readbase_string); | |
258 | |
259 #Filtering : error detection | |
260 | |
261 if ($#current_readbase_array+1 != $current_coverage){ | |
262 return 9; | |
263 #parsing error (couldn't parse the readbase string correctly) | |
264 } | |
265 elsif ($current_coverage<$MIN_DEPTH){ | |
266 return 2; | |
267 # 2 : insufficient depth | |
268 } | |
269 elsif ($current_coverage>$MAX_DEPTH){ | |
270 return 6; | |
271 # 6 : too much depth | |
272 } | |
273 else { | |
274 if ($#IN>=0){ | |
275 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
276 return 1; | |
277 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
278 } | |
279 else { | |
280 ########## TEST de coherence des insertions ################ | |
281 # for (my $i=0;$i<=$#IN;$i++){ | |
282 # if (uc($IN[0]) ne uc($IN[$i])){ | |
283 # print uc($IN[0]),"\n"; | |
284 # print uc($IN[$i]),"\n"; | |
285 # return "1i"; | |
286 # } | |
287 # } | |
288 ########################################################### | |
289 | |
290 if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ | |
291 return 3; | |
292 # 3 : insufficient variant frequency | |
293 } | |
294 } | |
295 } | |
296 elsif ($#DEL>=0){ | |
297 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ | |
298 return 1; | |
299 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) | |
300 } | |
301 else { | |
302 ########## TEST de coherence des deletions ################ | |
303 # for (my $i=0;$i<=$#DEL;$i++){ | |
304 # if (uc($DEL[0]) ne uc($DEL[$i])){ | |
305 # print uc($DEL[0]),"\n"; | |
306 # print uc($DEL[$i]),"\n"; | |
307 # return "1d"; | |
308 # } | |
309 # } | |
310 ########################################################### | |
311 | |
312 if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ | |
313 return 3; | |
314 # 3 : insufficient variant frequency | |
315 } | |
316 } | |
317 } | |
318 else { | |
319 my $nbA=0; | |
320 $nbA++ while ($current_readbase_string =~ m/A/g); | |
321 my $nbC=0; | |
322 $nbC++ while ($current_readbase_string =~ m/C/g); | |
323 my $nbT=0; | |
324 $nbT++ while ($current_readbase_string =~ m/T/g); | |
325 my $nbG=0; | |
326 $nbG++ while ($current_readbase_string =~ m/G/g); | |
327 my $nbN=0; | |
328 $nbN++ while ($current_readbase_string =~ m/N/g); | |
329 my $nba=0; | |
330 $nba++ while ($current_readbase_string =~ m/a/g); | |
331 my $nbc=0; | |
332 $nbc++ while ($current_readbase_string =~ m/c/g); | |
333 my $nbt=0; | |
334 $nbt++ while ($current_readbase_string =~ m/t/g); | |
335 my $nbg=0; | |
336 $nbg++ while ($current_readbase_string =~ m/g/g); | |
337 my $nbn=0; | |
338 $nbn++ while ($current_readbase_string =~ m/n/g); | |
339 | |
340 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
341 return "1m"; | |
342 } | |
343 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ | |
344 return "1m"; | |
345 } | |
346 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ | |
347 return "1m"; | |
348 } | |
349 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ | |
350 return "1m"; | |
351 } | |
352 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ | |
353 return "1m"; | |
354 } | |
355 | |
356 if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){ | |
357 if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){ | |
358 return 4; | |
359 # 4 : variant position not covered by forward and reverse reads | |
360 } | |
361 } | |
362 elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){ | |
363 if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){ | |
364 return 4; | |
365 # 4 : variant position not covered by forward and reverse reads | |
366 } | |
367 } | |
368 elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){ | |
369 if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){ | |
370 return 4; | |
371 # 4 : variant position not covered by forward and reverse reads | |
372 } | |
373 } | |
374 elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){ | |
375 if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){ | |
376 return 4; | |
377 # 4 : variant position not covered by forward and reverse reads | |
378 } | |
379 } | |
380 elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){ | |
381 if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){ | |
382 return 4; | |
383 # 4 : variant position not covered by forward and reverse reads | |
384 } | |
385 } | |
386 else { | |
387 return 3; | |
388 # 3 : insufficient variant frequency | |
389 } | |
390 } | |
391 } | |
392 | |
393 return 0; | |
394 } |