annotate rapsodyn/mpileupfilter.pl @ 17:9435ccd4e0c4 draft

Uploaded
author mcharles
date Wed, 20 Aug 2014 12:23:06 -0400
parents ad321ff1b67d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
1 #!/usr/bin/perl
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
2 use strict;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
3 use Getopt::Long;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
4
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
5 #
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
6 # Filter a pileup file on forward/reverse presence and %read having the variant
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
7 # The error code
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
8 # 1 : multiple variant type detected insertion/deletion/mutation
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
9 # 1i : inconsistency in insertion
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
10 # 1d : inconsistency in deletion
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
11 # 1m : inconsistency in mutation
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
12 # 2 : insufficient depth
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
13 # 3 : insufficient variant frequency
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
14 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
15 # 5 : variant with other variant in neighbourhood
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
16 # 6 : too much depth
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
17 # 8 : parsing error (couldn't parse the mpileup line correctly)
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
18 # 9 : parsing error (couldn't parse the readbase string correctly)
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
19
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
20
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
21 my $inputfile;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
22 my $logfile;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
23 my $MIN_DISTANCE=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
24 my $MIN_VARIANTFREQUENCY=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
25 my $MIN_FORWARDREVERSE=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
26 my $MIN_DEPTH=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
27 my $MAX_DEPTH=500;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
28 my $VERBOSE=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
29 my $ONLY_UNFILTERED_VARIANT="OFF";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
30
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
31 if ($#ARGV<0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
32 print "\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
33 print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
34 print "-input_file \tinputfile in mpileup format\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
35 print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
36 print "-min_depth \tminimum depth required [1]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
37 print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
38 print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
39 print "-min_distance \tminimum distance between variant [0]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
40 print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
41 print "\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
42 exit(0);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
43 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
44
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
45 GetOptions (
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
46 "input_file=s" => \$inputfile,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
47 "log_file=s" => \$logfile,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
48 "min_depth=i" => \$MIN_DEPTH,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
49 "max_depth=i" => \$MAX_DEPTH,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
50 "min_frequency=f" => \$MIN_VARIANTFREQUENCY,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
51 "min_distance=i" => \$MIN_DISTANCE,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
52 "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
53 "variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
54 "v=i" => \$VERBOSE
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
55 ) or die("Error in command line arguments\n");
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
56
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
57
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
58 open(IF, $inputfile) or die("Can't open $inputfile\n");
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
59
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
60 my @tbl_line;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
61 my @tbl_variant_position;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
62 my @tbl_variant_chr;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
63 my @tbl_variant_refbase;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
64 my @tbl_variant_coverage;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
65 my @tbl_variant_readbase_string;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
66 my @tbl_variant_quality_string;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
67
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
68 #Extraction des variants
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
69 my $nb_line=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
70 while (my $line=<IF>){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
71 $nb_line++;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
72 if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
73 print "$nb_line\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
74 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
75 my $error_code=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
76 if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
77 my $current_chromosome = $1;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
78 my $current_position = $2;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
79 my $current_refbase = $3;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
80 my $current_coverage = $4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
81 my $current_readbase_string = $5;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
82 my $current_quality_string = $6;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
83
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
84 #Suppression of mPileUp special character
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
85 $current_readbase_string =~ s/\$//g; #the read start at this position
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
86 $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
87
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
88 if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
89 push(@tbl_line,$line);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
90 push(@tbl_variant_chr,$current_chromosome);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
91 push(@tbl_variant_position,$current_position);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
92 push(@tbl_variant_refbase,$current_refbase);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
93 push(@tbl_variant_coverage,$current_coverage);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
94 push(@tbl_variant_readbase_string,$current_readbase_string);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
95 push(@tbl_variant_quality_string,$current_quality_string);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
96 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
97 print $line;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
98 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
99
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
100 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
101 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
102 #Position with no variant
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
103 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
104
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
105 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
106 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
107 #Error Parsing
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
108 print STDERR "$line #8";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
109 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
110 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
111 close(IF);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
112
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
113 if ($ONLY_UNFILTERED_VARIANT eq "ON"){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
114 exit(0);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
115 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
116
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
117 ####Checking the distance between variant and other filter
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
118
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
119 if ($logfile){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
120 open(LF,">$logfile") or die ("Cant't open $logfile\n");
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
121 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
122
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
123 for (my $i=0;$i<=$#tbl_line;$i++){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
124 # print "ligne : $tbl_line[$i]\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
125
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
126 my $error_code=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
127 if ($i==0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
128 #Comparing $i and $i+1 for neighbourhood filter;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
129 if ($#tbl_line>0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
130 if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
131 $error_code=5;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
132 chomp($tbl_line[$i]);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
133 if ($logfile){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
134 print LF "$tbl_line[$i]\tcode:$error_code\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
135 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
136 next;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
137 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
138 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
139
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
140 #Additionnal filters
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
141 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
142
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
143 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
144 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
145 #Compairing $i and $i-1 for neighbourhood filter
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
146 if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
147 $error_code=5;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
148 chomp($tbl_line[$i]);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
149 if ($logfile){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
150 print LF "$tbl_line[$i]\tcode:$error_code\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
151 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
152 next;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
153 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
154 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
155 #Additionnal filters
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
156 $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
157 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
158 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
159 if ($error_code == 0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
160 print $tbl_line[$i];
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
161 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
162 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
163 chomp($tbl_line[$i]);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
164 if ($logfile){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
165 print LF "$tbl_line[$i]\tcode:$error_code\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
166 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
167 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
168 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
169
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
170 if ($logfile){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
171 close (LF);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
172 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
173
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
174 sub check_error{
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
175 my $current_chromosome = shift;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
176 my $current_position = shift;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
177 my $current_refbase = shift;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
178 my $current_coverage = shift;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
179 my $current_readbase_string = shift;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
180
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
181 # print "test : $current_readbase_string\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
182
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
183
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
184
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
185 #Extraction of insertions
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
186
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
187 ##################################################################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
188 # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
189 # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
190 # print "IN : @IN\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
191 # print "DEL :@DEL\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
192 #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
193 ##################################################################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
194 #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
195 ##################################################################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
196
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
197 # => parcours de boucle
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
198 my @readbase = split(//,$current_readbase_string);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
199 my $cleaned_readbase_string="";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
200 my @IN;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
201 my @DEL;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
202 my $current_IN="";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
203 my $current_DEL="";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
204 my $current_size=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
205
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
206 for (my $i=0;$i<=$#readbase;$i++){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
207 if ($readbase[$i] eq "+"){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
208 #Ouverture de IN
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
209 $current_IN="+";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
210
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
211 #Recuperation de la taille
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
212 my $sub = substr $current_readbase_string,$i;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
213 if ($sub=~/^\+(\d+)/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
214 $current_size = $1;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
215 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
216 my $remaining_size = $current_size;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
217 while (($remaining_size>0)&&($i<=$#readbase)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
218 $i++;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
219 $current_IN.=$readbase[$i];
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
220 if ($readbase[$i]=~ /[ATGCNatgcn]/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
221 $remaining_size--;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
222 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
223 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
224 push(@IN,$current_IN);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
225 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
226 elsif ($readbase[$i] eq "-"){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
227 #Ouverture de DEL
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
228 $current_DEL="-";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
229
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
230 #Recuperation de la taille
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
231 my $sub = substr $current_readbase_string,$i;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
232 if ($sub=~/^\-(\d+)/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
233 $current_size = $1;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
234 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
235 my $remaining_size = $current_size;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
236 while (($remaining_size>0)&&($i<=$#readbase)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
237 $i++;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
238 $current_DEL.=$readbase[$i];
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
239 if ($readbase[$i]=~ /[ATGCNatgcn]/){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
240 $remaining_size--;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
241 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
242 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
243 push(@DEL,$current_DEL);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
244
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
245 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
246 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
247 #Ajout a la string
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
248 $cleaned_readbase_string .= $readbase[$i];
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
249 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
250 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
251
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
252
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
253 # print "IN : @IN\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
254 # print "DEL :@DEL\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
255 # print "$cleaned_readbase_string\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
256
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
257 my @current_readbase_array = split(//,$cleaned_readbase_string);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
258
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
259 #Filtering : error detection
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
260
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
261 if ($#current_readbase_array+1 != $current_coverage){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
262 return 9;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
263 #parsing error (couldn't parse the readbase string correctly)
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
264 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
265 elsif ($current_coverage<$MIN_DEPTH){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
266 return 2;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
267 # 2 : insufficient depth
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
268 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
269 elsif ($current_coverage>$MAX_DEPTH){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
270 return 6;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
271 # 6 : too much depth
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
272 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
273 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
274 if ($#IN>=0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
275 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
276 return 1;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
277 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
278 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
279 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
280 ########## TEST de coherence des insertions ################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
281 # for (my $i=0;$i<=$#IN;$i++){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
282 # if (uc($IN[0]) ne uc($IN[$i])){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
283 # print uc($IN[0]),"\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
284 # print uc($IN[$i]),"\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
285 # return "1i";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
286 # }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
287 # }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
288 ###########################################################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
289
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
290 if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
291 return 3;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
292 # 3 : insufficient variant frequency
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
293 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
294 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
295 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
296 elsif ($#DEL>=0){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
297 if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
298 return 1;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
299 # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
300 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
301 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
302 ########## TEST de coherence des deletions ################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
303 # for (my $i=0;$i<=$#DEL;$i++){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
304 # if (uc($DEL[0]) ne uc($DEL[$i])){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
305 # print uc($DEL[0]),"\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
306 # print uc($DEL[$i]),"\n";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
307 # return "1d";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
308 # }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
309 # }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
310 ###########################################################
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
311
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
312 if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
313 return 3;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
314 # 3 : insufficient variant frequency
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
315 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
316 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
317 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
318 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
319 my $nbA=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
320 $nbA++ while ($current_readbase_string =~ m/A/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
321 my $nbC=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
322 $nbC++ while ($current_readbase_string =~ m/C/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
323 my $nbT=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
324 $nbT++ while ($current_readbase_string =~ m/T/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
325 my $nbG=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
326 $nbG++ while ($current_readbase_string =~ m/G/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
327 my $nbN=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
328 $nbN++ while ($current_readbase_string =~ m/N/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
329 my $nba=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
330 $nba++ while ($current_readbase_string =~ m/a/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
331 my $nbc=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
332 $nbc++ while ($current_readbase_string =~ m/c/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
333 my $nbt=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
334 $nbt++ while ($current_readbase_string =~ m/t/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
335 my $nbg=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
336 $nbg++ while ($current_readbase_string =~ m/g/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
337 my $nbn=0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
338 $nbn++ while ($current_readbase_string =~ m/n/g);
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
339
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
340 if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
341 return "1m";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
342 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
343 if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
344 return "1m";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
345 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
346 if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
347 return "1m";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
348 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
349 if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
350 return "1m";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
351 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
352 if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
353 return "1m";
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
354 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
355
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
356 if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
357 if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
358 return 4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
359 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
360 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
361 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
362 elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
363 if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
364 return 4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
365 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
366 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
367 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
368 elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
369 if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
370 return 4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
371 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
372 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
373 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
374 elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
375 if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
376 return 4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
377 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
378 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
379 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
380 elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
381 if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
382 return 4;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
383 # 4 : variant position not covered by forward and reverse reads
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
384 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
385 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
386 else {
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
387 return 3;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
388 # 3 : insufficient variant frequency
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
389 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
390 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
391 }
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
392
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
393 return 0;
ad321ff1b67d Uploaded
mcharles
parents:
diff changeset
394 }