Mercurial > repos > mcharles > rapsosnp
comparison rapsodyn/PrepareFastqLight.pl @ 0:442a7c88b886 draft
Uploaded
author | mcharles |
---|---|
date | Wed, 10 Sep 2014 09:18:15 -0400 |
parents | |
children | 9074a5104cdd |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:442a7c88b886 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 | |
5 my $read1 = $ARGV[0]; | |
6 my $read2 = $ARGV[1]; | |
7 | |
8 my $output1 = $ARGV[2]; | |
9 my $output2 = $ARGV[3]; | |
10 | |
11 my $TYPE = $ARGV[4]; | |
12 my $MIN_LENGTH = $ARGV[5]; | |
13 my $MIN_QUALITY = $ARGV[6]; | |
14 | |
15 my $VERBOSE = $ARGV[7]; | |
16 | |
17 if (!$VERBOSE){ | |
18 $VERBOSE ="OFF"; | |
19 } | |
20 | |
21 open(READ1, $read1) or die ("Can't open $read1\n"); | |
22 open(READ2, $read2) or die ("Can't open $read2\n"); | |
23 open(OUT1, ">$output1") or die ("Can't open $output1\n"); | |
24 open(OUT2, ">$output2") or die ("Can't open $output2\n"); | |
25 | |
26 | |
27 my $error1=0; | |
28 my $error2=0; | |
29 my $error3=0; | |
30 my $error4=0; | |
31 my $error5=0; | |
32 my $error6=0; | |
33 my $error7=0; | |
34 my $error8=0; | |
35 my $error9=0; | |
36 my $error10=0; | |
37 | |
38 while (my $ligne1_r1 =<READ1>){ | |
39 my $ligne2_r1 =<READ1>; | |
40 my $ligne3_r1 =<READ1>; | |
41 my $ligne4_r1 =<READ1>; | |
42 my $ligne1_r2 =<READ2>; | |
43 my $ligne2_r2 =<READ2>; | |
44 my $ligne3_r2 =<READ2>; | |
45 my $ligne4_r2 =<READ2>; | |
46 | |
47 #@ 1 sec | |
48 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){ | |
49 if ($VERBOSE eq "ON"){ | |
50 print "Error in file format"; | |
51 if ($ligne1_r1){print $ligne1_r1;} | |
52 if ($ligne2_r1){print $ligne2_r1;} | |
53 if ($ligne3_r1){print $ligne3_r1;} | |
54 if ($ligne4_r1){print $ligne4_r1;} | |
55 if ($ligne1_r2){print $ligne1_r2;} | |
56 if ($ligne2_r2){print $ligne2_r2;} | |
57 if ($ligne3_r2){print $ligne3_r2;} | |
58 if ($ligne4_r2){print $ligne4_r2;} | |
59 print "\n"; | |
60 } | |
61 $error1++; | |
62 } | |
63 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){ | |
64 if ($VERBOSE eq "ON"){ | |
65 print "Error in header : format\n"; | |
66 print $ligne1_r1; | |
67 print $ligne2_r1; | |
68 print $ligne3_r1; | |
69 print $ligne4_r1; | |
70 print $ligne1_r2; | |
71 print $ligne2_r2; | |
72 print $ligne3_r2; | |
73 print $ligne4_r2; | |
74 print "\n"; | |
75 } | |
76 $error2++; | |
77 } | |
78 #@ 1 - 2 sec | |
79 else { | |
80 | |
81 my $length_seq1 = length($ligne2_r1); | |
82 my $length_qual1 =length($ligne4_r1); | |
83 my $seq1; | |
84 my $qual1; | |
85 | |
86 my $length_seq2 = length($ligne2_r2); | |
87 my $length_qual2 =length($ligne4_r2); | |
88 my $seq2; | |
89 my $qual2; | |
90 my $header1=""; | |
91 my $header2=""; | |
92 my $repheader1=""; | |
93 my $repheader2=""; | |
94 | |
95 if ($ligne1_r1 =~/^\@(.*?)\#/){ | |
96 $header1 = $1; | |
97 } | |
98 | |
99 if ($ligne3_r1 =~/^\+(.*?)\#/){ | |
100 $repheader1 = $1; | |
101 } | |
102 | |
103 if ($ligne1_r2 =~/^\@(.*?)\#/){ | |
104 $header2 = $1; | |
105 } | |
106 | |
107 if ($ligne3_r2 =~/^\+(.*?)\#/){ | |
108 $repheader2 = $1; | |
109 } | |
110 #@ 2 sec | |
111 | |
112 ### Verification de la coherence sequence /qualité @ 1 sec | |
113 if (($TYPE eq "illumina")&&((!$header1)||(!$header2)||(!$repheader1)||(!$repheader2))){ | |
114 if ($VERBOSE eq "ON"){ | |
115 print "Error in header : empty\n"; | |
116 print $ligne1_r1; | |
117 print $ligne2_r1; | |
118 print $ligne3_r1; | |
119 print $ligne4_r1; | |
120 print $ligne1_r2; | |
121 print $ligne2_r2; | |
122 print $ligne3_r2; | |
123 print $ligne4_r2; | |
124 print "\n"; | |
125 } | |
126 $error3++; | |
127 } | |
128 elsif (($TYPE eq "sanger")&&((!$header1)||(!$header2))){ | |
129 if ($VERBOSE eq "ON"){ | |
130 print "Error in header refgsd : empty\n"; | |
131 print $ligne1_r1; | |
132 print $ligne2_r1; | |
133 print $ligne3_r1; | |
134 print $ligne4_r1; | |
135 print $ligne1_r2; | |
136 print $ligne2_r2; | |
137 print $ligne3_r2; | |
138 print $ligne4_r2; | |
139 print "\n"; | |
140 } | |
141 $error3++; | |
142 } | |
143 elsif (($TYPE eq "illumina")&&(($header1 ne $repheader1)||($header2 ne $repheader2)||($header1 ne $header2))){ | |
144 if ($VERBOSE eq "ON"){ | |
145 print "Error in header : different\n"; | |
146 print $ligne1_r1; | |
147 print $ligne2_r1; | |
148 print $ligne3_r1; | |
149 print $ligne4_r1; | |
150 print $ligne1_r2; | |
151 print $ligne2_r2; | |
152 print $ligne3_r2; | |
153 print $ligne4_r2; | |
154 print "\n"; | |
155 } | |
156 $error4++; | |
157 } | |
158 elsif (($TYPE eq "sanger")&&($header1 ne $header2)){ | |
159 if ($VERBOSE eq "ON"){ | |
160 print "Error in header : different\n"; | |
161 print $ligne1_r1; | |
162 print $ligne2_r1; | |
163 print $ligne3_r1; | |
164 print $ligne4_r1; | |
165 print $ligne1_r2; | |
166 print $ligne2_r2; | |
167 print $ligne3_r2; | |
168 print $ligne4_r2; | |
169 print "\n"; | |
170 } | |
171 $error4++; | |
172 } | |
173 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){ | |
174 if ($VERBOSE eq "ON"){ | |
175 print "Error in seq/qual length\n"; | |
176 print $ligne1_r1; | |
177 print $ligne2_r1; | |
178 print $ligne3_r1; | |
179 print $ligne4_r1; | |
180 print $ligne1_r2; | |
181 print $ligne2_r2; | |
182 print $ligne3_r2; | |
183 print $ligne4_r2; | |
184 print "\n"; | |
185 } | |
186 $error5++; | |
187 } | |
188 #@ 1 - 2 sec | |
189 else { | |
190 ### Parsing sequence & qualité | |
191 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){ | |
192 $seq1 = $1; | |
193 } | |
194 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){ | |
195 $seq2 = $1; | |
196 } | |
197 if ($ligne4_r1 =~ /^(.*)\s*$/i){ | |
198 $qual1 = $1; | |
199 } | |
200 if ($ligne4_r2 =~ /^(.*)\s*$/i){ | |
201 $qual2 = $1; | |
202 } | |
203 #@ 2 sec | |
204 ### Verification du parsing et de la coherence sequence /qualité (n°2) | |
205 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){ | |
206 if ($VERBOSE eq "ON"){ | |
207 print "Error parsing seq / quality \n"; | |
208 print $ligne1_r1; | |
209 print $ligne2_r1; | |
210 print $ligne3_r1; | |
211 print $ligne4_r1; | |
212 print $ligne1_r2; | |
213 print $ligne2_r2; | |
214 print $ligne3_r2; | |
215 print $ligne4_r2; | |
216 print "\n"; | |
217 } | |
218 $error6++; | |
219 } | |
220 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){ | |
221 if ($VERBOSE eq "ON"){ | |
222 print "Error in seq/qual length after parsing\n"; | |
223 print $ligne1_r1; | |
224 print $ligne2_r1; | |
225 print $ligne3_r1; | |
226 print $ligne4_r1; | |
227 print $ligne1_r2; | |
228 print $ligne2_r2; | |
229 print $ligne3_r2; | |
230 print $ligne4_r2; | |
231 print "\n"; | |
232 } | |
233 $error7++; | |
234 } | |
235 #@ <1 sec | |
236 else { | |
237 my $fastq_lines_r1=""; | |
238 my $fastq_lines_r2=""; | |
239 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1); | |
240 if ($fastq_lines_r1){ | |
241 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2); | |
242 } | |
243 if ($fastq_lines_r2){ | |
244 print OUT1 $fastq_lines_r1; | |
245 print OUT2 $fastq_lines_r2; | |
246 } | |
247 } | |
248 } | |
249 | |
250 # print OUT1 $ligne1_r1; | |
251 # print OUT1 $ligne2_r1; | |
252 # print OUT1 $ligne3_r1; | |
253 # print OUT1 $ligne4_r1; | |
254 # print OUT2 $ligne1_r2; | |
255 # print OUT2 $ligne2_r2; | |
256 # print OUT2 $ligne3_r2; | |
257 # print OUT2 $ligne4_r2; | |
258 | |
259 #@ 7 sec | |
260 } | |
261 } | |
262 | |
263 | |
264 | |
265 close (READ1); | |
266 close (READ2); | |
267 close (OUT1); | |
268 close (OUT2); | |
269 | |
270 | |
271 | |
272 | |
273 sub grooming_and_trimming{ | |
274 my $header = shift; | |
275 my $seq = shift; | |
276 my $quality = shift; | |
277 my $quality_converted=""; | |
278 | |
279 my $startnoN = 0; | |
280 my $stopnoN = length($quality)-1; | |
281 | |
282 #print "SEQ :\n$seq\n"; | |
283 | |
284 my $chercheN = $seq; | |
285 my @bad_position; | |
286 my $current_index = index($chercheN,"N"); | |
287 my $abs_index = $current_index; | |
288 while ($current_index >=0){ | |
289 push (@bad_position,$abs_index); | |
290 | |
291 if ($current_index<length($seq)){ | |
292 $chercheN = substr($chercheN,$current_index+1); | |
293 $current_index = index($chercheN,"N"); | |
294 $abs_index = $current_index + $bad_position[$#bad_position]+1; | |
295 } | |
296 else { | |
297 last; | |
298 } | |
299 } | |
300 | |
301 | |
302 if ($#bad_position>=0){ | |
303 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; | |
304 $startnoN = $coord{"start"}; | |
305 $stopnoN = $coord{"stop"}; | |
306 } | |
307 my $lengthnoN = $stopnoN - $startnoN + 1; | |
308 my $seqnoN = substr($seq,$startnoN,$lengthnoN); | |
309 #print "$seqnoN\n"; | |
310 | |
311 if ($lengthnoN >= $MIN_LENGTH){ | |
312 my $startTrim = $startnoN; | |
313 my $stopTrim = $stopnoN; | |
314 | |
315 my $quality_converted=""; | |
316 my @bad_position; | |
317 | |
318 my @q = split(//,$quality); | |
319 #print "QUALITY\n"; | |
320 #print "$quality\n"; | |
321 for (my $i=0;$i<=$stopnoN;$i++){ | |
322 my $chr = $q[$i]; | |
323 my $num = ord($q[$i]); | |
324 if ($TYPE eq "illumina"){ | |
325 $num = $num -64+33; | |
326 $quality_converted .= chr($num); | |
327 } | |
328 | |
329 if ($num <$MIN_QUALITY + 64 - 33 ){ | |
330 push(@bad_position,$i+$startnoN); | |
331 } | |
332 } | |
333 if ($quality_converted){$quality = $quality_converted;} | |
334 #print "$quality\n"; | |
335 | |
336 | |
337 | |
338 if ($#bad_position>=0){ | |
339 # for (my $i=0;$i<=$#bad_position;$i++){ | |
340 # print $bad_position[$i]."\t"; | |
341 # } | |
342 # print "\n"; | |
343 my %coord=%{&extract_longer_string_coordinates_from_bad_position($startnoN,$stopnoN,\@bad_position)}; | |
344 $startTrim = $coord{"start"}; | |
345 $stopTrim = $coord{"stop"}; | |
346 #print "$startTrim .. $stopTrim\n"; | |
347 | |
348 } | |
349 my $lengthTrim = $stopTrim - $startTrim +1; | |
350 | |
351 | |
352 my $fastq_lines=""; | |
353 | |
354 if ($lengthTrim >= $MIN_LENGTH){ | |
355 $fastq_lines .= $header; | |
356 $fastq_lines .= substr($seq,$startTrim,$lengthTrim)."\n"; | |
357 $fastq_lines .= "+\n"; | |
358 $fastq_lines .= substr($quality,$startTrim,$lengthTrim)."\n"; | |
359 return $fastq_lines; | |
360 } | |
361 else { | |
362 return ""; | |
363 } | |
364 | |
365 | |
366 | |
367 } | |
368 else { | |
369 return ""; | |
370 } | |
371 | |
372 | |
373 # my @s = split(//,$seq); | |
374 # my $sanger_quality=""; | |
375 | |
376 | |
377 | |
378 | |
379 # return $sanger_quality; | |
380 } | |
381 | |
382 sub extract_longer_string_coordinates_from_bad_position{ | |
383 my $start=shift; | |
384 my $stop =shift; | |
385 my $refbad = shift; | |
386 my @bad_position = @$refbad; | |
387 my %coord; | |
388 | |
389 my $current_start = $start; | |
390 my $current_stop = $bad_position[0]-1; | |
391 if ($current_stop < $start){$current_stop = $start;} | |
392 | |
393 | |
394 #debut -> premier N | |
395 my $current_length = $current_stop - $current_start +1; | |
396 my $test_length; | |
397 | |
398 #entre les N | |
399 for (my $i=1;$i<=$#bad_position;$i++){ | |
400 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1; | |
401 if ( $test_length > $current_length){ | |
402 $current_start = $bad_position[$i-1]+1; | |
403 $current_stop = $bad_position[$i]-1; | |
404 $current_length = $current_stop - $current_start +1; | |
405 } | |
406 } | |
407 | |
408 #dernier N -> fin | |
409 $test_length = $stop-$bad_position[$#bad_position]+1; | |
410 if ( $test_length > $current_length){ | |
411 $current_start = $bad_position[$#bad_position]+1; | |
412 if ($current_start > $stop){$current_start=$stop;} | |
413 $current_stop = $stop; | |
414 } | |
415 $coord{"start"}=$current_start; | |
416 $coord{"stop"}= $current_stop; | |
417 $coord{"lenght"}=$current_stop-$current_start+1; | |
418 | |
419 return \%coord; | |
420 } |