5
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use Getopt::Long;
|
|
4
|
|
5 #
|
|
6 # Filter a pileup file on forward/reverse presence and %read having the variant
|
|
7 # The error code
|
|
8 # 1 : multiple variant type detected insertion/deletion/mutation
|
|
9 # 1i : inconsistency in insertion
|
|
10 # 1d : inconsistency in deletion
|
|
11 # 1m : inconsistency in mutation
|
|
12 # 2 : insufficient depth
|
|
13 # 3 : insufficient variant frequency
|
|
14 # 4 : variant position not covered by forward and reverse reads
|
|
15 # 5 : variant with other variant in neighbourhood
|
|
16 # 6 : too much depth
|
|
17 # 8 : parsing error (couldn't parse the mpileup line correctly)
|
|
18 # 9 : parsing error (couldn't parse the readbase string correctly)
|
|
19
|
|
20
|
|
21 my $inputfile;
|
|
22 my $logfile;
|
|
23 my $MIN_DISTANCE=0;
|
|
24 my $MIN_VARIANTFREQUENCY=0;
|
|
25 my $MIN_FORWARDREVERSE=0;
|
|
26 my $MIN_DEPTH=0;
|
|
27 my $MAX_DEPTH=500;
|
|
28 my $VERBOSE=0;
|
|
29 my $ONLY_UNFILTERED_VARIANT="OFF";
|
|
30
|
|
31 if ($#ARGV<0){
|
|
32 print "\n";
|
|
33 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n";
|
|
34 print "-input_file \tinputfile in mpileup format\n";
|
|
35 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n";
|
|
36 print "-min_depth \tminimum depth required [1]\n";
|
|
37 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n";
|
|
38 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n";
|
|
39 print "-min_distance \tminimum distance between variant [0]\n";
|
|
40 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n";
|
|
41 print "\n";
|
|
42 exit(0);
|
|
43 }
|
|
44
|
|
45 GetOptions (
|
|
46 "input_file=s" => \$inputfile,
|
|
47 "log_file=s" => \$logfile,
|
|
48 "min_depth=i" => \$MIN_DEPTH,
|
|
49 "max_depth=i" => \$MAX_DEPTH,
|
|
50 "min_frequency=f" => \$MIN_VARIANTFREQUENCY,
|
|
51 "min_distance=i" => \$MIN_DISTANCE,
|
|
52 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
|
|
53 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
|
|
54 "v=i" => \$VERBOSE
|
|
55 ) or die("Error in command line arguments\n");
|
|
56
|
|
57
|
|
58 open(IF, $inputfile) or die("Can't open $inputfile\n");
|
|
59
|
|
60 my @tbl_line;
|
|
61 my @tbl_variant_position;
|
|
62 my @tbl_variant_chr;
|
|
63 my @tbl_variant_refbase;
|
|
64 my @tbl_variant_coverage;
|
|
65 my @tbl_variant_readbase_string;
|
|
66 my @tbl_variant_quality_string;
|
|
67
|
|
68 #Extraction des variants
|
|
69 my $nb_line=0;
|
|
70 while (my $line=<IF>){
|
|
71 $nb_line++;
|
|
72 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
|
|
73 print "$nb_line\n";
|
|
74 }
|
|
75 my $error_code=0;
|
|
76 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
|
|
77 my $current_chromosome = $1;
|
|
78 my $current_position = $2;
|
|
79 my $current_refbase = $3;
|
|
80 my $current_coverage = $4;
|
|
81 my $current_readbase_string = $5;
|
|
82 my $current_quality_string = $6;
|
|
83
|
|
84 #Suppression of mPileUp special character
|
|
85 $current_readbase_string =~ s/\$//g; #the read start at this position
|
|
86 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
|
|
87
|
|
88 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
|
|
89 push(@tbl_line,$line);
|
|
90 push(@tbl_variant_chr,$current_chromosome);
|
|
91 push(@tbl_variant_position,$current_position);
|
|
92 push(@tbl_variant_refbase,$current_refbase);
|
|
93 push(@tbl_variant_coverage,$current_coverage);
|
|
94 push(@tbl_variant_readbase_string,$current_readbase_string);
|
|
95 push(@tbl_variant_quality_string,$current_quality_string);
|
|
96 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
|
|
97 print $line;
|
|
98 }
|
|
99
|
|
100 }
|
|
101 else {
|
|
102 #Position with no variant
|
|
103 }
|
|
104
|
|
105 }
|
|
106 else {
|
|
107 #Error Parsing
|
|
108 print STDERR "$line #8";
|
|
109 }
|
|
110 }
|
|
111 close(IF);
|
|
112
|
|
113 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
|
|
114 exit(0);
|
|
115 }
|
|
116
|
|
117 ####Checking the distance between variant and other filter
|
|
118
|
|
119 if ($logfile){
|
|
120 open(LF,">$logfile") or die ("Cant't open $logfile\n");
|
|
121 }
|
|
122
|
|
123 for (my $i=0;$i<=$#tbl_line;$i++){
|
|
124 # print "ligne : $tbl_line[$i]\n";
|
|
125
|
|
126 my $error_code=0;
|
|
127 if ($i==0){
|
|
128 #Comparing $i and $i+1 for neighbourhood filter;
|
|
129 if ($#tbl_line>0){
|
|
130 if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){
|
|
131 $error_code=5;
|
|
132 chomp($tbl_line[$i]);
|
|
133 if ($logfile){
|
|
134 print LF "$tbl_line[$i]\tcode:$error_code\n";
|
|
135 }
|
|
136 next;
|
|
137 }
|
|
138 }
|
|
139
|
|
140 #Additionnal filters
|
|
141 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
|
|
142
|
|
143 }
|
|
144 else {
|
|
145 #Compairing $i and $i-1 for neighbourhood filter
|
|
146 if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){
|
|
147 $error_code=5;
|
|
148 chomp($tbl_line[$i]);
|
|
149 if ($logfile){
|
|
150 print LF "$tbl_line[$i]\tcode:$error_code\n";
|
|
151 }
|
|
152 next;
|
|
153 }
|
|
154 else {
|
|
155 #Additionnal filters
|
|
156 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
|
|
157 }
|
|
158 }
|
|
159 if ($error_code == 0){
|
|
160 print $tbl_line[$i];
|
|
161 }
|
|
162 else {
|
|
163 chomp($tbl_line[$i]);
|
|
164 if ($logfile){
|
|
165 print LF "$tbl_line[$i]\tcode:$error_code\n";
|
|
166 }
|
|
167 }
|
|
168 }
|
|
169
|
|
170 if ($logfile){
|
|
171 close (LF);
|
|
172 }
|
|
173
|
|
174 sub check_error{
|
|
175 my $current_chromosome = shift;
|
|
176 my $current_position = shift;
|
|
177 my $current_refbase = shift;
|
|
178 my $current_coverage = shift;
|
|
179 my $current_readbase_string = shift;
|
|
180
|
|
181 # print "test : $current_readbase_string\n";
|
|
182
|
|
183
|
|
184
|
|
185 #Extraction of insertions
|
|
186
|
|
187 ##################################################################
|
|
188 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
|
|
189 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
|
|
190 # print "IN : @IN\n";
|
|
191 # print "DEL :@DEL\n";
|
|
192 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g;
|
|
193 ##################################################################
|
|
194 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
|
|
195 ##################################################################
|
|
196
|
|
197 # => parcours de boucle
|
|
198 my @readbase = split(//,$current_readbase_string);
|
|
199 my $cleaned_readbase_string="";
|
|
200 my @IN;
|
|
201 my @DEL;
|
|
202 my $current_IN="";
|
|
203 my $current_DEL="";
|
|
204 my $current_size=0;
|
|
205
|
|
206 for (my $i=0;$i<=$#readbase;$i++){
|
|
207 if ($readbase[$i] eq "+"){
|
|
208 #Ouverture de IN
|
|
209 $current_IN="+";
|
|
210
|
|
211 #Recuperation de la taille
|
|
212 my $sub = substr $current_readbase_string,$i;
|
|
213 if ($sub=~/^\+(\d+)/){
|
|
214 $current_size = $1;
|
|
215 }
|
|
216 my $remaining_size = $current_size;
|
|
217 while (($remaining_size>0)&&($i<=$#readbase)){
|
|
218 $i++;
|
|
219 $current_IN.=$readbase[$i];
|
|
220 if ($readbase[$i]=~ /[ATGCNatgcn]/){
|
|
221 $remaining_size--;
|
|
222 }
|
|
223 }
|
|
224 push(@IN,$current_IN);
|
|
225 }
|
|
226 elsif ($readbase[$i] eq "-"){
|
|
227 #Ouverture de DEL
|
|
228 $current_DEL="-";
|
|
229
|
|
230 #Recuperation de la taille
|
|
231 my $sub = substr $current_readbase_string,$i;
|
|
232 if ($sub=~/^\-(\d+)/){
|
|
233 $current_size = $1;
|
|
234 }
|
|
235 my $remaining_size = $current_size;
|
|
236 while (($remaining_size>0)&&($i<=$#readbase)){
|
|
237 $i++;
|
|
238 $current_DEL.=$readbase[$i];
|
|
239 if ($readbase[$i]=~ /[ATGCNatgcn]/){
|
|
240 $remaining_size--;
|
|
241 }
|
|
242 }
|
|
243 push(@DEL,$current_DEL);
|
|
244
|
|
245 }
|
|
246 else {
|
|
247 #Ajout a la string
|
|
248 $cleaned_readbase_string .= $readbase[$i];
|
|
249 }
|
|
250 }
|
|
251
|
|
252
|
|
253 # print "IN : @IN\n";
|
|
254 # print "DEL :@DEL\n";
|
|
255 # print "$cleaned_readbase_string\n";
|
|
256
|
|
257 my @current_readbase_array = split(//,$cleaned_readbase_string);
|
|
258
|
|
259 #Filtering : error detection
|
|
260
|
|
261 if ($#current_readbase_array+1 != $current_coverage){
|
|
262 return 9;
|
|
263 #parsing error (couldn't parse the readbase string correctly)
|
|
264 }
|
|
265 elsif ($current_coverage<$MIN_DEPTH){
|
|
266 return 2;
|
|
267 # 2 : insufficient depth
|
|
268 }
|
|
269 elsif ($current_coverage>$MAX_DEPTH){
|
|
270 return 6;
|
|
271 # 6 : too much depth
|
|
272 }
|
|
273 else {
|
|
274 if ($#IN>=0){
|
|
275 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
|
|
276 return 1;
|
|
277 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
|
|
278 }
|
|
279 else {
|
|
280 ########## TEST de coherence des insertions ################
|
|
281 # for (my $i=0;$i<=$#IN;$i++){
|
|
282 # if (uc($IN[0]) ne uc($IN[$i])){
|
|
283 # print uc($IN[0]),"\n";
|
|
284 # print uc($IN[$i]),"\n";
|
|
285 # return "1i";
|
|
286 # }
|
|
287 # }
|
|
288 ###########################################################
|
|
289
|
|
290 if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
291 return 3;
|
|
292 # 3 : insufficient variant frequency
|
|
293 }
|
|
294 }
|
|
295 }
|
|
296 elsif ($#DEL>=0){
|
|
297 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
|
|
298 return 1;
|
|
299 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
|
|
300 }
|
|
301 else {
|
|
302 ########## TEST de coherence des deletions ################
|
|
303 # for (my $i=0;$i<=$#DEL;$i++){
|
|
304 # if (uc($DEL[0]) ne uc($DEL[$i])){
|
|
305 # print uc($DEL[0]),"\n";
|
|
306 # print uc($DEL[$i]),"\n";
|
|
307 # return "1d";
|
|
308 # }
|
|
309 # }
|
|
310 ###########################################################
|
|
311
|
|
312 if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
313 return 3;
|
|
314 # 3 : insufficient variant frequency
|
|
315 }
|
|
316 }
|
|
317 }
|
|
318 else {
|
|
319 my $nbA=0;
|
|
320 $nbA++ while ($current_readbase_string =~ m/A/g);
|
|
321 my $nbC=0;
|
|
322 $nbC++ while ($current_readbase_string =~ m/C/g);
|
|
323 my $nbT=0;
|
|
324 $nbT++ while ($current_readbase_string =~ m/T/g);
|
|
325 my $nbG=0;
|
|
326 $nbG++ while ($current_readbase_string =~ m/G/g);
|
|
327 my $nbN=0;
|
|
328 $nbN++ while ($current_readbase_string =~ m/N/g);
|
|
329 my $nba=0;
|
|
330 $nba++ while ($current_readbase_string =~ m/a/g);
|
|
331 my $nbc=0;
|
|
332 $nbc++ while ($current_readbase_string =~ m/c/g);
|
|
333 my $nbt=0;
|
|
334 $nbt++ while ($current_readbase_string =~ m/t/g);
|
|
335 my $nbg=0;
|
|
336 $nbg++ while ($current_readbase_string =~ m/g/g);
|
|
337 my $nbn=0;
|
|
338 $nbn++ while ($current_readbase_string =~ m/n/g);
|
|
339
|
|
340 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
|
|
341 return "1m";
|
|
342 }
|
|
343 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
|
|
344 return "1m";
|
|
345 }
|
|
346 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
|
|
347 return "1m";
|
|
348 }
|
|
349 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
|
|
350 return "1m";
|
|
351 }
|
|
352 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
|
|
353 return "1m";
|
|
354 }
|
|
355
|
|
356 if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
357 if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){
|
|
358 return 4;
|
|
359 # 4 : variant position not covered by forward and reverse reads
|
|
360 }
|
|
361 }
|
|
362 elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
363 if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){
|
|
364 return 4;
|
|
365 # 4 : variant position not covered by forward and reverse reads
|
|
366 }
|
|
367 }
|
|
368 elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
369 if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){
|
|
370 return 4;
|
|
371 # 4 : variant position not covered by forward and reverse reads
|
|
372 }
|
|
373 }
|
|
374 elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
375 if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){
|
|
376 return 4;
|
|
377 # 4 : variant position not covered by forward and reverse reads
|
|
378 }
|
|
379 }
|
|
380 elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){
|
|
381 if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){
|
|
382 return 4;
|
|
383 # 4 : variant position not covered by forward and reverse reads
|
|
384 }
|
|
385 }
|
|
386 else {
|
|
387 return 3;
|
|
388 # 3 : insufficient variant frequency
|
|
389 }
|
|
390 }
|
|
391 }
|
|
392
|
|
393 return 0;
|
|
394 }
|