0
|
1 #!/usr/bin/perl
|
15
|
2 #v1.1.1 new check on read synchro
|
10
|
3 #v1.1.0 manage empty files
|
|
4 #v1.0.4 bug correction, last read not considered
|
8
|
5 #v1.0.3 support rapsodyn header (.... 1:... / .... 2:...)
|
|
6 #V1.0.2 added auto type detection
|
7
|
7 #V1.0.1 added log, option parameters
|
0
|
8 use strict;
|
|
9 use warnings;
|
7
|
10 use Getopt::Long;
|
0
|
11
|
7
|
12 my $read1_file;
|
|
13 my $read2_file;
|
|
14 my $log_file;
|
|
15 my $output1_file;
|
|
16 my $output2_file;
|
0
|
17
|
7
|
18 my $TYPE="sanger";
|
|
19 my $MIN_LENGTH=30;
|
|
20 my $MIN_QUALITY=30;
|
|
21
|
|
22 my $VERBOSE = "OFF";
|
0
|
23
|
7
|
24 GetOptions (
|
|
25 "read1_file=s" => \$read1_file,
|
|
26 "read2_file=s" => \$read2_file,
|
|
27 "log_file=s" => \$log_file,
|
|
28 "output1_file=s" => \$output1_file,
|
|
29 "output2_file=s" => \$output2_file,
|
|
30 "type=s" => \$TYPE,
|
|
31 "min_length=i" => \$MIN_LENGTH,
|
|
32 "min_quality=i" => \$MIN_QUALITY,
|
|
33 "verbose=s" => \$VERBOSE
|
|
34 ) or die("Error in command line arguments\n");
|
0
|
35
|
|
36
|
7
|
37 my $nb_read1=0;
|
|
38 my $nb_base_read1=0;
|
|
39 my $nb_read2=0;
|
|
40 my $nb_base_read2=0;
|
0
|
41
|
7
|
42 my $nb_read1_t=0;
|
|
43 my $nb_base_read1_t=0;
|
|
44 my $nb_read2_t=0;
|
|
45 my $nb_base_read2_t=0;
|
|
46
|
|
47 my $nb_base_current_t=0;
|
|
48
|
|
49 open(READ1, $read1_file) or die ("Can't open $read1_file\n");
|
|
50 open(READ2, $read2_file) or die ("Can't open $read2_file\n");
|
|
51 open(OUT1, ">$output1_file") or die ("Can't open $output1_file\n");
|
|
52 open(OUT2, ">$output2_file") or die ("Can't open $output2_file\n");
|
8
|
53 open (LF,">$log_file") or die("Can't open $log_file\n");
|
|
54
|
10
|
55 if (( -z READ1)&&( -z READ2)){
|
|
56 exit(0);
|
|
57 }
|
|
58 elsif (( -z READ1)||( -z READ2)){
|
|
59 print STDERR "One empty File\n";
|
|
60 exit(0);
|
|
61 }
|
|
62
|
0
|
63
|
|
64 my $error1=0;
|
|
65 my $error2=0;
|
|
66 my $error3=0;
|
|
67 my $error4=0;
|
|
68 my $error5=0;
|
|
69 my $error6=0;
|
|
70 my $error7=0;
|
|
71 my $error8=0;
|
|
72 my $error9=0;
|
|
73 my $error10=0;
|
|
74
|
8
|
75 my $auto_type="";
|
|
76 my %qual;
|
|
77 if ($TYPE eq "auto"){
|
|
78 my $compt=0;
|
|
79 open(DETECT, $read1_file) or die ("Can't open $read1_file\n");
|
|
80 while (my $ligne1_r1 =<DETECT>){
|
|
81 my $ligne2_r1 =<DETECT>;
|
|
82 my $ligne3_r1 =<DETECT>;
|
|
83 my $ligne4_r1 =<DETECT>;
|
|
84 $compt++;
|
|
85 if ($ligne4_r1 =~ /^(.*)\s*$/i){
|
|
86 my $qual = $1;
|
|
87 my @q = split(//,$qual);
|
|
88 for (my $i=0;$i<=$#q;$i++){
|
|
89 my $num = ord($q[$i]);
|
|
90 if ($qual{$num}){
|
|
91 $qual{$num}++;
|
|
92 }
|
|
93 else {
|
|
94 $qual{$num} = 1;
|
|
95 }
|
|
96 #range sanger / illumina 1.8+ : 33->94
|
|
97 #range illumina 1.3->1.7 : 64->105
|
|
98 if ($num > 94){$auto_type = "illumina";last;}
|
|
99 if ($num < 64){$auto_type = "sanger";last;}
|
|
100 }
|
|
101 }
|
|
102 else {
|
|
103 print STDERR "Error in format detection : quality not recognized\n$ligne4_r1";
|
|
104 exit(0);
|
|
105 }
|
|
106
|
|
107 if ($auto_type ne ""){
|
|
108 last;
|
|
109 }
|
|
110
|
|
111 }
|
|
112 close (DETECT);
|
|
113 if ($auto_type eq ""){
|
|
114 print STDERR "Error in format detection : type not recognized parsing read1\n";
|
|
115 foreach my $key (sort {$a <=> $b} keys %qual){
|
|
116 print "$key\t:\t",$qual{$key},"\n";
|
|
117 }
|
|
118 exit(0);
|
|
119 }
|
|
120 else {
|
|
121 $TYPE = $auto_type;
|
|
122 }
|
|
123 }
|
|
124
|
|
125
|
|
126
|
10
|
127 my $compt=0;
|
0
|
128 while (my $ligne1_r1 =<READ1>){
|
|
129 my $ligne2_r1 =<READ1>;
|
|
130 my $ligne3_r1 =<READ1>;
|
|
131 my $ligne4_r1 =<READ1>;
|
|
132 my $ligne1_r2 =<READ2>;
|
|
133 my $ligne2_r2 =<READ2>;
|
|
134 my $ligne3_r2 =<READ2>;
|
|
135 my $ligne4_r2 =<READ2>;
|
10
|
136
|
|
137 $compt++;
|
7
|
138 $nb_read1++;
|
|
139 $nb_read2++;
|
0
|
140
|
15
|
141
|
0
|
142 if ((!$ligne1_r1)||(!$ligne2_r1)||(!$ligne3_r1)||(!$ligne4_r1)||(!$ligne1_r2)||(!$ligne2_r2)||(!$ligne3_r2)||(!$ligne4_r2)){
|
|
143 if ($VERBOSE eq "ON"){
|
|
144 print "Error in file format";
|
|
145 if ($ligne1_r1){print $ligne1_r1;}
|
|
146 if ($ligne2_r1){print $ligne2_r1;}
|
|
147 if ($ligne3_r1){print $ligne3_r1;}
|
|
148 if ($ligne4_r1){print $ligne4_r1;}
|
|
149 if ($ligne1_r2){print $ligne1_r2;}
|
|
150 if ($ligne2_r2){print $ligne2_r2;}
|
|
151 if ($ligne3_r2){print $ligne3_r2;}
|
|
152 if ($ligne4_r2){print $ligne4_r2;}
|
|
153 print "\n";
|
|
154 }
|
|
155 $error1++;
|
|
156 }
|
|
157 elsif(($ligne1_r1 !~/^\@/)||($ligne1_r2 !~/^\@/)||($ligne3_r1 !~/^\+/)||($ligne3_r2 !~/^\+/)){
|
|
158 if ($VERBOSE eq "ON"){
|
|
159 print "Error in header : format\n";
|
|
160 print $ligne1_r1;
|
|
161 print $ligne2_r1;
|
|
162 print $ligne3_r1;
|
|
163 print $ligne4_r1;
|
|
164 print $ligne1_r2;
|
|
165 print $ligne2_r2;
|
|
166 print $ligne3_r2;
|
|
167 print $ligne4_r2;
|
|
168 print "\n";
|
|
169 }
|
|
170 $error2++;
|
|
171 }
|
15
|
172
|
0
|
173 else {
|
|
174
|
10
|
175 my $length_seq1 = length(chomp($ligne2_r1));
|
|
176 my $length_qual1 =length(chomp($ligne4_r1));
|
0
|
177 my $seq1;
|
|
178 my $qual1;
|
|
179
|
10
|
180 my $length_seq2 = length(chomp($ligne2_r2));
|
|
181 my $length_qual2 =length(chomp($ligne4_r2));
|
0
|
182 my $seq2;
|
|
183 my $qual2;
|
|
184 my $header1="";
|
|
185 my $header2="";
|
|
186 my $repheader1="";
|
|
187 my $repheader2="";
|
7
|
188
|
15
|
189 my @tbl_header1;
|
|
190 my @tbl_header2;
|
|
191 if ($ligne1_r1 =~/^\@(.*?)\s*$/){
|
0
|
192 $header1 = $1;
|
15
|
193 @tbl_header1 = split(//,$header1);
|
0
|
194 }
|
|
195
|
15
|
196 if ($ligne3_r1 =~/^\+(.*?)\s*$/){
|
0
|
197 $repheader1 = $1;
|
|
198 }
|
|
199
|
15
|
200 if ($ligne1_r2 =~/^\@(.*?)\s*$/){
|
0
|
201 $header2 = $1;
|
15
|
202 @tbl_header2 = split(//,$header2);
|
0
|
203 }
|
|
204
|
15
|
205 if ($ligne3_r2 =~/^\+(.*?)\s*$/){
|
0
|
206 $repheader2 = $1;
|
|
207 }
|
15
|
208 my $diffheader=0;
|
|
209 if ($#tbl_header1 == $#tbl_header2){
|
|
210 for (my $i=0;$i<=$#tbl_header1;$i++){
|
|
211 if ($tbl_header1[$i] ne $tbl_header2[$i]){
|
|
212 $diffheader++;
|
|
213 }
|
|
214 }
|
|
215 }
|
0
|
216
|
15
|
217
|
|
218
|
|
219 ### Verification de la coherence sequence /qualité
|
|
220 if ((!$header1)||(!$header2)){
|
0
|
221 if ($VERBOSE eq "ON"){
|
|
222 print "Error in header : empty\n";
|
|
223 print $ligne1_r1;
|
|
224 print $ligne2_r1;
|
|
225 print $ligne3_r1;
|
|
226 print $ligne4_r1;
|
|
227 print $ligne1_r2;
|
|
228 print $ligne2_r2;
|
|
229 print $ligne3_r2;
|
|
230 print $ligne4_r2;
|
|
231 print "\n";
|
|
232 }
|
|
233 $error3++;
|
|
234 }
|
15
|
235 elsif ((($repheader1)&&($header1 ne $repheader1))||(($repheader2)&&($header2 ne $repheader2))){
|
0
|
236 if ($VERBOSE eq "ON"){
|
15
|
237 print "Error : difference in header and header repeat\n";
|
0
|
238 print $ligne1_r1;
|
|
239 print $ligne2_r1;
|
|
240 print $ligne3_r1;
|
|
241 print $ligne4_r1;
|
|
242 print $ligne1_r2;
|
|
243 print $ligne2_r2;
|
|
244 print $ligne3_r2;
|
|
245 print $ligne4_r2;
|
|
246 print "\n";
|
|
247 }
|
|
248 $error4++;
|
|
249 }
|
15
|
250 elsif ($#tbl_header1 != $#tbl_header2){
|
0
|
251 if ($VERBOSE eq "ON"){
|
15
|
252 print "Error : difference in header size between reads\n";
|
|
253 print $ligne1_r1;
|
|
254 print $ligne2_r1;
|
|
255 print $ligne3_r1;
|
|
256 print $ligne4_r1;
|
|
257 print $ligne1_r2;
|
|
258 print $ligne2_r2;
|
|
259 print $ligne3_r2;
|
|
260 print $ligne4_r2;
|
|
261 print "\n";
|
|
262 }
|
|
263 $error4++;
|
|
264 }
|
|
265 elsif ($diffheader > 1 ){ # More than ...1 and ...2 difference in read1 and read2 header
|
|
266 if ($VERBOSE eq "ON"){
|
|
267 print "Error can't establish synchro between reads, more than 1 difference between headers\n";
|
0
|
268 print $ligne1_r1;
|
|
269 print $ligne2_r1;
|
|
270 print $ligne3_r1;
|
|
271 print $ligne4_r1;
|
|
272 print $ligne1_r2;
|
|
273 print $ligne2_r2;
|
|
274 print $ligne3_r2;
|
|
275 print $ligne4_r2;
|
|
276 print "\n";
|
|
277 }
|
|
278 $error4++;
|
|
279 }
|
|
280 elsif (($length_seq1 != $length_qual1)||($length_seq2 != $length_qual2)){
|
|
281 if ($VERBOSE eq "ON"){
|
|
282 print "Error in seq/qual length\n";
|
10
|
283 print "$length_seq1 / $length_qual1 \t $length_seq2 / $length_qual2\n";
|
0
|
284 print $ligne1_r1;
|
|
285 print $ligne2_r1;
|
|
286 print $ligne3_r1;
|
|
287 print $ligne4_r1;
|
10
|
288 print "\n";
|
0
|
289 print $ligne1_r2;
|
|
290 print $ligne2_r2;
|
|
291 print $ligne3_r2;
|
|
292 print $ligne4_r2;
|
|
293 print "\n";
|
|
294 }
|
|
295 $error5++;
|
|
296 }
|
|
297 #@ 1 - 2 sec
|
|
298 else {
|
10
|
299 #print "TEST : $compt\n";
|
0
|
300 ### Parsing sequence & qualité
|
|
301 if ($ligne2_r1 =~ /^([ATGCNX]+)\s*$/i){
|
|
302 $seq1 = $1;
|
7
|
303 $nb_base_read1 += length($seq1);
|
0
|
304 }
|
|
305 if ($ligne2_r2 =~ /^([ATGCNX]+)\s*$/i){
|
|
306 $seq2 = $1;
|
7
|
307 $nb_base_read2 += length($seq2);
|
0
|
308 }
|
|
309 if ($ligne4_r1 =~ /^(.*)\s*$/i){
|
|
310 $qual1 = $1;
|
|
311 }
|
|
312 if ($ligne4_r2 =~ /^(.*)\s*$/i){
|
|
313 $qual2 = $1;
|
|
314 }
|
|
315 #@ 2 sec
|
|
316 ### Verification du parsing et de la coherence sequence /qualité (n°2)
|
|
317 if ((!$seq1)||(!$seq2)||(!$qual1)||(!$qual2)){
|
|
318 if ($VERBOSE eq "ON"){
|
|
319 print "Error parsing seq / quality \n";
|
|
320 print $ligne1_r1;
|
|
321 print $ligne2_r1;
|
|
322 print $ligne3_r1;
|
|
323 print $ligne4_r1;
|
|
324 print $ligne1_r2;
|
|
325 print $ligne2_r2;
|
|
326 print $ligne3_r2;
|
|
327 print $ligne4_r2;
|
|
328 print "\n";
|
|
329 }
|
|
330 $error6++;
|
|
331 }
|
|
332 elsif ((length($seq1) != length($qual1))||(length($seq2) != length($qual2))){
|
|
333 if ($VERBOSE eq "ON"){
|
|
334 print "Error in seq/qual length after parsing\n";
|
|
335 print $ligne1_r1;
|
|
336 print $ligne2_r1;
|
|
337 print $ligne3_r1;
|
|
338 print $ligne4_r1;
|
|
339 print $ligne1_r2;
|
|
340 print $ligne2_r2;
|
|
341 print $ligne3_r2;
|
|
342 print $ligne4_r2;
|
|
343 print "\n";
|
|
344 }
|
|
345 $error7++;
|
|
346 }
|
|
347 #@ <1 sec
|
|
348 else {
|
|
349 my $fastq_lines_r1="";
|
|
350 my $fastq_lines_r2="";
|
7
|
351 my $nb_base_current_read1_t = 0;
|
|
352 my $nb_base_current_read2_t = 0;
|
|
353
|
0
|
354 $fastq_lines_r1 = &grooming_and_trimming($ligne1_r1,$seq1,$qual1);
|
7
|
355 $nb_base_current_read1_t = $nb_base_current_t;
|
0
|
356 if ($fastq_lines_r1){
|
|
357 $fastq_lines_r2 = &grooming_and_trimming($ligne1_r2,$seq2,$qual2);
|
7
|
358 $nb_base_current_read2_t = $nb_base_current_t;
|
0
|
359 }
|
|
360 if ($fastq_lines_r2){
|
|
361 print OUT1 $fastq_lines_r1;
|
|
362 print OUT2 $fastq_lines_r2;
|
7
|
363
|
|
364 $nb_read1_t++;
|
|
365 $nb_read2_t++;
|
|
366 $nb_base_read1_t += $nb_base_current_read1_t;
|
|
367 $nb_base_read2_t += $nb_base_current_read2_t;
|
|
368
|
|
369
|
0
|
370 }
|
|
371 }
|
|
372 }
|
|
373
|
7
|
374
|
0
|
375 #@ 7 sec
|
|
376 }
|
|
377 }
|
|
378
|
|
379 close (READ1);
|
|
380 close (READ2);
|
|
381 close (OUT1);
|
|
382 close (OUT2);
|
|
383
|
7
|
384 print LF "\n####\t Fastq preparation \n";
|
8
|
385 print LF "Fastq format : $TYPE\n";
|
7
|
386 print LF "## Before preparation\n";
|
|
387 print LF "#Read1 :\t$nb_read1\t#Base :\t$nb_base_read1\n";
|
|
388 print LF "#Read2 :\t$nb_read2\t#Base :\t$nb_base_read2\n";
|
|
389 print LF "## After preparation\n";
|
|
390 print LF "#Read1 :\t$nb_read1_t\t#Base :\t$nb_base_read1_t\n";
|
|
391 print LF "#Read2 :\t$nb_read2_t\t#Base :\t$nb_base_read2_t\n";
|
|
392 close (LF);
|
|
393
|
0
|
394
|
|
395 sub grooming_and_trimming{
|
|
396 my $header = shift;
|
|
397 my $seq = shift;
|
|
398 my $quality = shift;
|
|
399 my $quality_converted="";
|
5
|
400 my $quality_ori=$quality;
|
0
|
401
|
5
|
402 my $lengthseq = length($seq);
|
|
403 my $startTrim = 0;
|
|
404 my $stopTrim = length($quality)-1;
|
|
405 my $startnoN = $startTrim;
|
|
406 my $stopnoN = $stopTrim;
|
0
|
407
|
|
408
|
|
409 my $chercheN = $seq;
|
5
|
410 my @bad_position_N;
|
|
411 my @bad_position_Q;
|
0
|
412 my $current_index = index($chercheN,"N");
|
|
413 my $abs_index = $current_index;
|
|
414 while ($current_index >=0){
|
5
|
415 push (@bad_position_N,$abs_index);
|
0
|
416
|
|
417 if ($current_index<length($seq)){
|
|
418 $chercheN = substr($chercheN,$current_index+1);
|
|
419 $current_index = index($chercheN,"N");
|
5
|
420 $abs_index = $current_index + $bad_position_N[$#bad_position_N]+1;
|
0
|
421 }
|
|
422 else {
|
|
423 last;
|
|
424 }
|
|
425 }
|
5
|
426
|
|
427 my @q = split(//,$quality);
|
|
428 for (my $i=0;$i<=$#q;$i++){
|
|
429 my $chr = $q[$i];
|
|
430 my $num = ord($q[$i]);
|
|
431 if ($TYPE eq "illumina"){
|
|
432 $num = $num - 31; # 31 comme la difference entre la plage sanger (33-> 93 / 0->60) et illumina (64->104 / 0->40)
|
|
433 $quality_converted .= chr($num);
|
|
434 }
|
|
435
|
|
436 if ($num < $MIN_QUALITY + 33){ #33 comme le départ de la plage sanger
|
|
437 push(@bad_position_Q,$i);
|
|
438 }
|
|
439 }
|
|
440 if ($quality_converted){$quality = $quality_converted;}
|
0
|
441
|
5
|
442 my @bad_position = (@bad_position_N, @bad_position_Q);
|
0
|
443
|
|
444 if ($#bad_position>=0){
|
5
|
445 @bad_position = sort {$a <=> $b} @bad_position;
|
|
446 my %coord=%{&extract_longer_string_coordinates_from_bad_position(0,$stopTrim,\@bad_position)};
|
|
447 $startTrim = $coord{"start"};
|
|
448 $stopTrim = $coord{"stop"};
|
|
449 #print "$startTrim .. $stopTrim\n";
|
0
|
450
|
5
|
451 }
|
|
452 my $lengthTrim = $stopTrim - $startTrim +1;
|
7
|
453
|
|
454 #if ($stats_length{$lengthTrim}){
|
|
455 # $stats_length{$lengthTrim} = 1;
|
|
456 #}
|
|
457 #else {
|
|
458 # $stats_length{$lengthTrim}++;
|
|
459 #}
|
5
|
460 my $fastq_lines="";
|
|
461
|
|
462 # if ($header =~ /GA8\-EAS671_0005\:3\:1\:1043\:4432/){
|
|
463 # print "HEAD:\t$header";
|
|
464 # print "SEQ:\n$seq\n";
|
|
465 # print "$quality_ori\n";
|
|
466 # print "$quality\n";
|
|
467 # for (my $i=0;$i<=$#bad_position;$i++){
|
|
468 # print $bad_position[$i]."(".$q[$bad_position[$i]]." : ".ord($q[$bad_position[$i]]).")"."\t";
|
|
469 # }
|
|
470 # print "\n";
|
|
471 # print "$startTrim .. $stopTrim / $lengthTrim \n";
|
|
472 # print $fastq_lines;
|
|
473 # print "\n";
|
|
474 # }
|
|
475
|
7
|
476 #for (my $i=$startTrim;$i<=$stopTrim;$i++){
|
|
477 # if ($stats_quality{ord($q{$i])}){
|
|
478 # $stats_quality{ord($q{$i])}=1;
|
|
479 # }
|
|
480 # else {
|
|
481 # $stats_quality{ord($q{$i])}++;
|
|
482 # }
|
|
483 #}
|
|
484
|
5
|
485 if ($lengthTrim >= $MIN_LENGTH){
|
|
486 $fastq_lines .= $header;
|
7
|
487 my $new_seq = substr($seq,$startTrim,$lengthTrim);
|
|
488 $nb_base_current_t = length($new_seq);
|
|
489 $fastq_lines .= $new_seq."\n";
|
5
|
490 $fastq_lines .= "+\n";
|
7
|
491 my $new_q = substr($quality,$startTrim,$lengthTrim);
|
|
492 $fastq_lines .= $new_q."\n";
|
5
|
493 return $fastq_lines;
|
0
|
494
|
|
495 }
|
|
496 else {
|
5
|
497 #print "Insufficient length after trimming\n";
|
0
|
498 return "";
|
|
499 }
|
|
500 }
|
|
501
|
|
502 sub extract_longer_string_coordinates_from_bad_position{
|
|
503 my $start=shift;
|
|
504 my $stop =shift;
|
|
505 my $refbad = shift;
|
|
506 my @bad_position = @$refbad;
|
|
507 my %coord;
|
|
508
|
|
509 my $current_start = $start;
|
|
510 my $current_stop = $bad_position[0]-1;
|
|
511 if ($current_stop < $start){$current_stop = $start;}
|
|
512
|
|
513
|
|
514 #debut -> premier N
|
|
515 my $current_length = $current_stop - $current_start +1;
|
|
516 my $test_length;
|
|
517
|
|
518 #entre les N
|
|
519 for (my $i=1;$i<=$#bad_position;$i++){
|
|
520 $test_length = $bad_position[$i]+1-$bad_position[$i-1]-1;
|
|
521 if ( $test_length > $current_length){
|
|
522 $current_start = $bad_position[$i-1]+1;
|
|
523 $current_stop = $bad_position[$i]-1;
|
|
524 $current_length = $current_stop - $current_start +1;
|
|
525 }
|
|
526 }
|
|
527
|
|
528 #dernier N -> fin
|
|
529 $test_length = $stop-$bad_position[$#bad_position]+1;
|
|
530 if ( $test_length > $current_length){
|
|
531 $current_start = $bad_position[$#bad_position]+1;
|
|
532 if ($current_start > $stop){$current_start=$stop;}
|
|
533 $current_stop = $stop;
|
|
534 }
|
|
535 $coord{"start"}=$current_start;
|
|
536 $coord{"stop"}= $current_stop;
|
|
537 $coord{"lenght"}=$current_stop-$current_start+1;
|
|
538
|
|
539 return \%coord;
|
|
540 }
|