comparison mpileupfilter/mpileupfilter.pl @ 5:26d2c851d48e draft

Uploaded
author mcharles
date Mon, 16 Jun 2014 06:18:07 -0400
parents
children
comparison
equal deleted inserted replaced
4:53eff7c08ad3 5:26d2c851d48e
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Long;
4
5 #
6 # Filter a pileup file on forward/reverse presence and %read having the variant
7 # The error code
8 # 1 : multiple variant type detected insertion/deletion/mutation
9 # 1i : inconsistency in insertion
10 # 1d : inconsistency in deletion
11 # 1m : inconsistency in mutation
12 # 2 : insufficient depth
13 # 3 : insufficient variant frequency
14 # 4 : variant position not covered by forward and reverse reads
15 # 5 : variant with other variant in neighbourhood
16 # 6 : too much depth
17 # 8 : parsing error (couldn't parse the mpileup line correctly)
18 # 9 : parsing error (couldn't parse the readbase string correctly)
19
20
21 my $inputfile;
22 my $logfile;
23 my $MIN_DISTANCE=0;
24 my $MIN_VARIANTFREQUENCY=0;
25 my $MIN_FORWARDREVERSE=0;
26 my $MIN_DEPTH=0;
27 my $MAX_DEPTH=500;
28 my $VERBOSE=0;
29 my $ONLY_UNFILTERED_VARIANT="OFF";
30
31 if ($#ARGV<0){
32 print "\n";
33 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n";
34 print "-input_file \tinputfile in mpileup format\n";
35 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n";
36 print "-min_depth \tminimum depth required [1]\n";
37 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n";
38 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n";
39 print "-min_distance \tminimum distance between variant [0]\n";
40 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n";
41 print "\n";
42 exit(0);
43 }
44
45 GetOptions (
46 "input_file=s" => \$inputfile,
47 "log_file=s" => \$logfile,
48 "min_depth=i" => \$MIN_DEPTH,
49 "max_depth=i" => \$MAX_DEPTH,
50 "min_frequency=f" => \$MIN_VARIANTFREQUENCY,
51 "min_distance=i" => \$MIN_DISTANCE,
52 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
53 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
54 "v=i" => \$VERBOSE
55 ) or die("Error in command line arguments\n");
56
57
58 open(IF, $inputfile) or die("Can't open $inputfile\n");
59
60 my @tbl_line;
61 my @tbl_variant_position;
62 my @tbl_variant_chr;
63 my @tbl_variant_refbase;
64 my @tbl_variant_coverage;
65 my @tbl_variant_readbase_string;
66 my @tbl_variant_quality_string;
67
68 #Extraction des variants
69 my $nb_line=0;
70 while (my $line=<IF>){
71 $nb_line++;
72 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
73 print "$nb_line\n";
74 }
75 my $error_code=0;
76 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
77 my $current_chromosome = $1;
78 my $current_position = $2;
79 my $current_refbase = $3;
80 my $current_coverage = $4;
81 my $current_readbase_string = $5;
82 my $current_quality_string = $6;
83
84 #Suppression of mPileUp special character
85 $current_readbase_string =~ s/\$//g; #the read start at this position
86 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
87
88 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
89 push(@tbl_line,$line);
90 push(@tbl_variant_chr,$current_chromosome);
91 push(@tbl_variant_position,$current_position);
92 push(@tbl_variant_refbase,$current_refbase);
93 push(@tbl_variant_coverage,$current_coverage);
94 push(@tbl_variant_readbase_string,$current_readbase_string);
95 push(@tbl_variant_quality_string,$current_quality_string);
96 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
97 print $line;
98 }
99
100 }
101 else {
102 #Position with no variant
103 }
104
105 }
106 else {
107 #Error Parsing
108 print STDERR "$line #8";
109 }
110 }
111 close(IF);
112
113 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
114 exit(0);
115 }
116
117 ####Checking the distance between variant and other filter
118
119 if ($logfile){
120 open(LF,">$logfile") or die ("Cant't open $logfile\n");
121 }
122
123 for (my $i=0;$i<=$#tbl_line;$i++){
124 # print "ligne : $tbl_line[$i]\n";
125
126 my $error_code=0;
127 if ($i==0){
128 #Comparing $i and $i+1 for neighbourhood filter;
129 if ($#tbl_line>0){
130 if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){
131 $error_code=5;
132 chomp($tbl_line[$i]);
133 if ($logfile){
134 print LF "$tbl_line[$i]\tcode:$error_code\n";
135 }
136 next;
137 }
138 }
139
140 #Additionnal filters
141 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
142
143 }
144 else {
145 #Compairing $i and $i-1 for neighbourhood filter
146 if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){
147 $error_code=5;
148 chomp($tbl_line[$i]);
149 if ($logfile){
150 print LF "$tbl_line[$i]\tcode:$error_code\n";
151 }
152 next;
153 }
154 else {
155 #Additionnal filters
156 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
157 }
158 }
159 if ($error_code == 0){
160 print $tbl_line[$i];
161 }
162 else {
163 chomp($tbl_line[$i]);
164 if ($logfile){
165 print LF "$tbl_line[$i]\tcode:$error_code\n";
166 }
167 }
168 }
169
170 if ($logfile){
171 close (LF);
172 }
173
174 sub check_error{
175 my $current_chromosome = shift;
176 my $current_position = shift;
177 my $current_refbase = shift;
178 my $current_coverage = shift;
179 my $current_readbase_string = shift;
180
181 # print "test : $current_readbase_string\n";
182
183
184
185 #Extraction of insertions
186
187 ##################################################################
188 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
189 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
190 # print "IN : @IN\n";
191 # print "DEL :@DEL\n";
192 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g;
193 ##################################################################
194 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
195 ##################################################################
196
197 # => parcours de boucle
198 my @readbase = split(//,$current_readbase_string);
199 my $cleaned_readbase_string="";
200 my @IN;
201 my @DEL;
202 my $current_IN="";
203 my $current_DEL="";
204 my $current_size=0;
205
206 for (my $i=0;$i<=$#readbase;$i++){
207 if ($readbase[$i] eq "+"){
208 #Ouverture de IN
209 $current_IN="+";
210
211 #Recuperation de la taille
212 my $sub = substr $current_readbase_string,$i;
213 if ($sub=~/^\+(\d+)/){
214 $current_size = $1;
215 }
216 my $remaining_size = $current_size;
217 while (($remaining_size>0)&&($i<=$#readbase)){
218 $i++;
219 $current_IN.=$readbase[$i];
220 if ($readbase[$i]=~ /[ATGCNatgcn]/){
221 $remaining_size--;
222 }
223 }
224 push(@IN,$current_IN);
225 }
226 elsif ($readbase[$i] eq "-"){
227 #Ouverture de DEL
228 $current_DEL="-";
229
230 #Recuperation de la taille
231 my $sub = substr $current_readbase_string,$i;
232 if ($sub=~/^\-(\d+)/){
233 $current_size = $1;
234 }
235 my $remaining_size = $current_size;
236 while (($remaining_size>0)&&($i<=$#readbase)){
237 $i++;
238 $current_DEL.=$readbase[$i];
239 if ($readbase[$i]=~ /[ATGCNatgcn]/){
240 $remaining_size--;
241 }
242 }
243 push(@DEL,$current_DEL);
244
245 }
246 else {
247 #Ajout a la string
248 $cleaned_readbase_string .= $readbase[$i];
249 }
250 }
251
252
253 # print "IN : @IN\n";
254 # print "DEL :@DEL\n";
255 # print "$cleaned_readbase_string\n";
256
257 my @current_readbase_array = split(//,$cleaned_readbase_string);
258
259 #Filtering : error detection
260
261 if ($#current_readbase_array+1 != $current_coverage){
262 return 9;
263 #parsing error (couldn't parse the readbase string correctly)
264 }
265 elsif ($current_coverage<$MIN_DEPTH){
266 return 2;
267 # 2 : insufficient depth
268 }
269 elsif ($current_coverage>$MAX_DEPTH){
270 return 6;
271 # 6 : too much depth
272 }
273 else {
274 if ($#IN>=0){
275 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
276 return 1;
277 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
278 }
279 else {
280 ########## TEST de coherence des insertions ################
281 # for (my $i=0;$i<=$#IN;$i++){
282 # if (uc($IN[0]) ne uc($IN[$i])){
283 # print uc($IN[0]),"\n";
284 # print uc($IN[$i]),"\n";
285 # return "1i";
286 # }
287 # }
288 ###########################################################
289
290 if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
291 return 3;
292 # 3 : insufficient variant frequency
293 }
294 }
295 }
296 elsif ($#DEL>=0){
297 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
298 return 1;
299 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
300 }
301 else {
302 ########## TEST de coherence des deletions ################
303 # for (my $i=0;$i<=$#DEL;$i++){
304 # if (uc($DEL[0]) ne uc($DEL[$i])){
305 # print uc($DEL[0]),"\n";
306 # print uc($DEL[$i]),"\n";
307 # return "1d";
308 # }
309 # }
310 ###########################################################
311
312 if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
313 return 3;
314 # 3 : insufficient variant frequency
315 }
316 }
317 }
318 else {
319 my $nbA=0;
320 $nbA++ while ($current_readbase_string =~ m/A/g);
321 my $nbC=0;
322 $nbC++ while ($current_readbase_string =~ m/C/g);
323 my $nbT=0;
324 $nbT++ while ($current_readbase_string =~ m/T/g);
325 my $nbG=0;
326 $nbG++ while ($current_readbase_string =~ m/G/g);
327 my $nbN=0;
328 $nbN++ while ($current_readbase_string =~ m/N/g);
329 my $nba=0;
330 $nba++ while ($current_readbase_string =~ m/a/g);
331 my $nbc=0;
332 $nbc++ while ($current_readbase_string =~ m/c/g);
333 my $nbt=0;
334 $nbt++ while ($current_readbase_string =~ m/t/g);
335 my $nbg=0;
336 $nbg++ while ($current_readbase_string =~ m/g/g);
337 my $nbn=0;
338 $nbn++ while ($current_readbase_string =~ m/n/g);
339
340 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
341 return "1m";
342 }
343 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
344 return "1m";
345 }
346 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
347 return "1m";
348 }
349 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
350 return "1m";
351 }
352 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
353 return "1m";
354 }
355
356 if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){
357 if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){
358 return 4;
359 # 4 : variant position not covered by forward and reverse reads
360 }
361 }
362 elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){
363 if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){
364 return 4;
365 # 4 : variant position not covered by forward and reverse reads
366 }
367 }
368 elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){
369 if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){
370 return 4;
371 # 4 : variant position not covered by forward and reverse reads
372 }
373 }
374 elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){
375 if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){
376 return 4;
377 # 4 : variant position not covered by forward and reverse reads
378 }
379 }
380 elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){
381 if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){
382 return 4;
383 # 4 : variant position not covered by forward and reverse reads
384 }
385 }
386 else {
387 return 3;
388 # 3 : insufficient variant frequency
389 }
390 }
391 }
392
393 return 0;
394 }