comparison trim_galore @ 0:3c1664caa8e3 draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:52:23 -0400
parents
children 898db63d2e84
comparison
equal deleted inserted replaced
-1:000000000000 0:3c1664caa8e3
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Getopt::Long;
5 use IPC::Open3;
6 use File::Spec;
7 use File::Basename;
8 use Cwd;
9
10 ## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
11
12 ## This program is free software: you can redistribute it and/or modify
13 ## it under the terms of the GNU General Public License as published by
14 ## the Free Software Foundation, either version 3 of the License, or
15 ## (at your option) any later version.
16
17 ## This program is distributed in the hope that it will be useful,
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ## GNU General Public License for more details.
21
22 ## You should have received a copy of the GNU General Public License
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
24
25
26
27 ## this script is taking in FastQ sequences and trims them with Cutadapt
28 ## last modified on 18 10 2012
29
30 ########################################################################
31
32 # change these paths if needed
33
34 my $path_to_cutadapt = 'cutadapt';
35 my $path_to_fastqc = 'fastqc';
36
37 ########################################################################
38
39
40 my $trimmer_version = '0.2.5';
41 my $DOWARN = 1; # print on screen warning and text by default
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
43
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
45
46 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
47 unless (defined $cutoff){
48 $cutoff = 20;
49 }
50 my $phred_score_cutoff = $cutoff; # only relevant for report
51
52 unless (defined $adapter){
53 $adapter = 'AGATCGGAAGAGC';
54 }
55 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
56 $a2 = '';
57 }
58
59 unless (defined $stringency){
60 $stringency = 1;
61 }
62
63 unless (defined $length_cutoff){
64 $length_cutoff = 20;
65 }
66
67 if ($phred_encoding == 64){
68 $cutoff += 31;
69 }
70
71 my @filenames = @ARGV;
72
73 my $file_1;
74 my $file_2;
75
76 foreach my $filename (@ARGV){
77 trim ($filename);
78 }
79
80
81 sub trim{
82 my $filename = shift;
83
84 my $output_filename = (split (/\//,$filename))[-1];
85 # warn "Here is the outputfile name: $output_filename\n";
86
87 my $report = $output_filename;
88 $report =~ s/$/_trimming_report.txt/;
89
90 if ($no_report_file) {
91 $report = File::Spec->devnull;
92 open (REPORT,'>',$report) or die "Failed to write to file: $!\n";
93 # warn "Redirecting report output to /dev/null\n";
94 }
95 else{
96 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n";
97 warn "Writing report to '$output_dir$report'\n";
98 }
99
100 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
101 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
102
103 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
104 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
105
106 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
107 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
108
109 warn "Adapter sequence: '$adapter'\n";
110 print REPORT "Adapter sequence: '$adapter'\n";
111
112 if ($error_rate == 0.1){
113 warn "Maximum trimming error rate: $error_rate (default)\n";
114 }
115 else{
116 warn "Maximum trimming error rate: $error_rate\n";
117 }
118
119 print REPORT "Maximum trimming error rate: $error_rate";
120 if ($error_rate == 0.1){
121 print REPORT " (default)\n";
122 }
123 else{
124 print REPORT "\n";
125 }
126
127 if ($a2){
128 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
129 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
130 }
131
132 warn "Minimum required adapter overlap (stringency): $stringency bp\n";
133 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
134
135 if ($validate){
136 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
137 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
138 }
139 else{
140 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
141 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
142 }
143
144 if ($validate){ # only for paired-end files
145
146 if ($retain){ # keeping single-end reads if only one end is long enough
147
148 if ($length_read_1 == 35){
149 warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
150 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
151 }
152 else{
153 warn "Length cut-off for read 1: $length_read_1 bp\n";
154 print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
155 }
156
157 if ($length_read_2 == 35){
158 warn "Length cut-off for read 2: $length_read_2 b (default)\n";
159 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
160 }
161 else{
162 warn "Length cut-off for read 2: $length_read_2 bp\n";
163 print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
164 }
165 }
166 }
167
168 if ($rrbs){
169 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
170 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
171 }
172
173 if ($non_directional){
174 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
175 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
176 }
177
178 if ($trim){
179 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
180 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
181 }
182
183 if ($fastqc){
184 warn "Running FastQC on the data once trimming has completed\n";
185 print REPORT "Running FastQC on the data once trimming has completed\n";
186
187 if ($fastqc_args){
188 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
189 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n";
190 }
191 }
192
193 if ($keep and $rrbs){
194 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
195 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
196 }
197
198 if ($gzip or $filename =~ /\.gz$/){
199 warn "Output file will be GZIP compressed\n";
200 print REPORT "Output file will be GZIP compressed\n";
201 }
202
203 warn "\n";
204 print REPORT "\n";
205 sleep (3);
206
207 my $temp;
208
209 ### Proceeding differently for RRBS and other type of libraries
210 if ($rrbs){
211
212 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
213 if ($cutoff == 0){
214 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n";
215 sleep (3);
216 }
217 else{
218
219 $temp = $filename;
220 $temp =~ s/$/_qual_trimmed.fastq/;
221 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!";
222
223 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
224 sleep (3);
225
226 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
227
228 my $qual_count = 0;
229
230 while (1){
231 my $l1 = <QUAL>;
232 my $seq = <QUAL>;
233 my $l3 = <QUAL>;
234 my $qual = <QUAL>;
235 last unless (defined $qual);
236
237 $qual_count++;
238 if ($qual_count%10000000 == 0){
239 warn "$qual_count sequences processed\n";
240 }
241 print TEMP "$l1$seq$l3$qual";
242 }
243
244 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
245 close QUAL or die "Unable to close filehandle: $!\n";
246 sleep (3);
247
248 }
249 }
250
251
252 if ($output_filename =~ /\.fastq$/){
253 $output_filename =~ s/\.fastq$/_trimmed.fq/;
254 }
255 elsif ($output_filename =~ /\.fastq\.gz$/){
256 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
257 }
258 elsif ($output_filename =~ /\.fq$/){
259 $output_filename =~ s/\.fq$/_trimmed.fq/;
260 }
261 elsif ($output_filename =~ /\.fq\.gz$/){
262 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
263 }
264 else{
265 $output_filename =~ s/$/_trimmed.fq/;
266 }
267
268 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
269 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
270 sleep (2);
271
272 my $count = 0;
273 my $too_short = 0;
274 my $quality_trimmed = 0;
275 my $rrbs_trimmed = 0;
276 my $rrbs_trimmed_start = 0;
277 my $CAA = 0;
278 my $CGA = 0;
279
280 if ($rrbs and $cutoff != 0){
281
282 ### optionally using 2 different adapters for read 1 and read 2
283 if ($validate and $a2){
284 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
285 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
286 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
287 sleep (3);
288 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
289 }
290 else{ # this is read 2 of a pair
291 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
292 sleep (3);
293 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
294 }
295 }
296 ### Using the same adapter for both read 1 and read 2
297 else{
298 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
299 sleep (3);
300 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
301 }
302
303 close WRITER or die $!; # not needed
304
305 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
306
307 if ($filename =~ /\.gz$/){
308 open (IN,"zcat $filename |") or die $!; # original, untrimmed file
309 }
310 else{
311 open (IN,$filename) or die $!; # original, untrimmed file
312 }
313
314 while (1){
315
316 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
317 my $l1 = <TRIM>;
318 my $seq = <TRIM>; # adapter trimmed sequence
319 my $l3 = <TRIM>;
320 my $qual = <TRIM>;
321
322 $_ = <IN>; # irrelevant
323 my $original_seq = <IN>;
324 $_ = <IN>; # irrelevant
325 $_ = <IN>; # irrelevant
326
327 $_ = <QUAL>; # irrelevant
328 my $qual_trimmed_seq = <QUAL>;
329 $_ = <QUAL>; # irrelevant
330 my $qual_trimmed_qual = <QUAL>;
331
332 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
333
334 $count++;
335 if ($count%10000000 == 0){
336 warn "$count sequences processed\n";
337 }
338
339 chomp $seq;
340 chomp $qual;
341 chomp $qual_trimmed_seq;
342 chomp $original_seq;
343
344 my $quality_trimmed_seq_length = length $qual_trimmed_seq;
345
346 if (length $original_seq > length $qual_trimmed_seq){
347 ++$quality_trimmed;
348 }
349
350 my $nd = 0;
351
352 ### NON-DIRECTIONAL RRBS
353 if ($non_directional){
354 if (length$seq > 2){
355 if ($seq =~ /^CAA/){
356 ++$CAA;
357 $seq = substr ($seq,2,length($seq)-2);
358 $qual = substr ($qual,2,length($qual)-2);
359 ++$rrbs_trimmed_start;
360 $nd = 1;
361 }
362 elsif ($seq =~ /^CGA/){
363 $seq = substr ($seq,2,length($seq)-2);
364 $qual = substr ($qual,2,length($qual)-2);
365 ++$CGA;
366 ++$rrbs_trimmed_start;
367 $nd = 1;
368 }
369 }
370 }
371
372 ### directional read
373 unless ($nd == 1){
374 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
375 $seq = substr ($seq,0,length($seq)-2);
376 $qual = substr ($qual,0,length($qual)-2);
377 ++$rrbs_trimmed;
378 }
379 }
380
381 ### Shortening all sequences by 1 bp on the 3' end
382 if ($trim){
383 $seq = substr($seq,0,length($seq)-1);
384 $qual = substr($qual,0,length($qual)-1);
385 }
386
387 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
388 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
389 print OUT "$l1$seq\n$l3$qual\n";
390 }
391 else{ # single end
392 if (length $seq < $length_cutoff){
393 ++$too_short;
394 next;
395 }
396 else{
397 print OUT "$l1$seq\n$l3$qual\n";
398 }
399 }
400 }
401
402 print REPORT "\n";
403 while (<ERROR>){
404 warn $_;
405 print REPORT $_;
406 }
407
408 close IN or die "Unable to close IN filehandle: $!";
409 close QUAL or die "Unable to close QUAL filehandle: $!";
410 close TRIM or die "Unable to close TRIM filehandle: $!";
411 close OUT or die "Unable to close OUT filehandle: $!";
412 }
413 else{
414
415 ### optionally using 2 different adapters for read 1 and read 2
416 if ($validate and $a2){
417 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
418 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
419 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
420 sleep (3);
421 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
422 }
423 else{ # this is read 2 of a pair
424 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
425 sleep (3);
426 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
427 }
428 }
429 ### Using the same adapter for both read 1 and read 2
430 else{
431 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
432 sleep (3);
433 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
434 }
435
436 close WRITER or die $!; # not needed
437
438 while (1){
439
440 my $l1 = <TRIM>;
441 my $seq = <TRIM>; # quality and/or adapter trimmed sequence
442 my $l3 = <TRIM>;
443 my $qual = <TRIM>;
444 # print "$l1$seq\n$l3$qual\n";
445 last unless (defined $qual); # could be an empty string
446
447 $count++;
448 if ($count%10000000 == 0){
449 warn "$count sequences processed\n";
450 }
451
452 chomp $seq;
453 chomp $qual;
454
455 ### Shortening all sequences by 1 bp on the 3' end
456 if ($trim){
457 $seq = substr($seq,0,length($seq)-1);
458 $qual = substr($qual,0,length($qual)-1);
459 }
460
461 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
462 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
463 print OUT "$l1$seq\n$l3$qual\n";
464 }
465 else{ # single end
466 if (length $seq < $length_cutoff){
467 ++$too_short;
468 next;
469 }
470 else{
471 print OUT "$l1$seq\n$l3$qual\n";
472 }
473 }
474 }
475
476 print REPORT "\n";
477 while (<ERROR>){
478 warn $_;
479 print REPORT $_;
480 }
481
482 close TRIM or die "Unable to close TRIM filehandle: $!\n";
483 close ERROR or die "Unable to close ERROR filehandle: $!\n";
484 close OUT or die "Unable to close OUT filehandle: $!\n";
485
486 }
487
488 if ($rrbs){
489 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
490
491 # deleting temporary quality trimmed file
492 my $deleted = unlink "$output_dir$temp";
493
494 if ($deleted){
495 warn "Successfully deleted temporary file $temp\n\n";
496 }
497 else{
498 warn "Could not delete temporary file $temp";
499 }
500 }
501 }
502
503 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
504 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
505
506 warn "="x 45,"\n";
507 print REPORT "="x 45,"\n";
508
509 warn "$count sequences processed in total\n";
510 print REPORT "$count sequences processed in total\n";
511
512 ### only reporting this separately if quality and adapter trimming were performed separately
513 if ($rrbs){
514 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
515 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
516 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
517 }
518
519 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
520 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
521 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
522
523 if ($rrbs){
524 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
525 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
526 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
527 }
528
529 if ($non_directional){
530 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
531 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
532 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
533 }
534
535 warn "\n";
536 print REPORT "\n";
537
538 ### RUNNING FASTQC
539 if ($fastqc){
540 warn "\n >>> Now running FastQC on the data <<<\n\n";
541 sleep (5);
542 if ($fastqc_args){
543 system ("$path_to_fastqc $fastqc_args $output_filename");
544 }
545 else{
546 system ("$path_to_fastqc $output_filename");
547 }
548 }
549
550 ### COMPRESSING OUTPUT FILE
551 unless ($validate){ # not gzipping intermediate files for paired-end files
552 if ($gzip or $filename =~ /\.gz$/){
553 warn "\n >>> GZIP-ing the output file <<<\n\n";
554 system ("gzip -f $output_filename");
555 $output_filename = $output_filename.'.gz';
556 }
557 }
558
559 ### VALIDATE PAIRED-END FILES
560 if ($validate){
561
562 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
563
564 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
565 $file_1 = $output_filename;
566 shift @filenames;
567 # warn "This is read 1: $file_1\n\n";
568 }
569 else{ # this is read 2 of a pair
570 $file_2 = $output_filename;
571 shift @filenames;
572 # warn "This is read 2: $file_2\n\n";
573 }
574
575 if ($file_1 and $file_2){
576 warn "Validate paired-end files $file_1 and $file_2\n";
577 sleep (1);
578
579 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
580
581 ### RUNNING FASTQC
582 if ($fastqc){
583
584 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n";
585 sleep (3);
586
587 if ($fastqc_args){
588 system ("$path_to_fastqc $fastqc_args $val_1");
589 }
590 else{
591 system ("$path_to_fastqc $val_1");
592 }
593
594 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n";
595 sleep (3);
596
597 if ($fastqc_args){
598 system ("$path_to_fastqc $fastqc_args $val_2");
599 }
600 else{
601 system ("$path_to_fastqc $val_2");
602 }
603
604 }
605
606 if ($gzip or $filename =~ /\.gz$/){
607
608 # compressing main fastQ output files
609 warn "Compressing the validated output file $val_1 ...\n";
610 system ("gzip -f $val_1");
611
612 warn "Compressing the validated output file $val_2 ...\n";
613 system ("gzip -f $val_2");
614
615
616 if ($retain){ # compressing unpaired reads
617 warn "Compressing the unpaired read output $un_1 ...\n";
618 system ("gzip -f $un_1");
619
620 warn "Compressing the unpaired read output $un_2 ...\n";
621 system ("gzip -f $un_2");
622 }
623 }
624
625 warn "Deleting both intermediate output files $file_1 and $file_2\n";
626 unlink "$output_dir$file_1";
627 unlink "$output_dir$file_2";
628
629 warn "\n",'='x100,"\n\n";
630 sleep (3);
631
632 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
633 $file_2 = undef;
634 }
635 }
636
637 }
638
639 sub validate_paired_end_files{
640
641 my $file_1 = shift;
642 my $file_2 = shift;
643
644 warn "file_1 $file_1 file_2 $file_2\n\n";
645
646 if ($file_1 =~ /\.gz$/){
647 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
648 }
649 else{
650 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
651 }
652
653 if ($file_2 =~ /\.gz$/){
654 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
655 }
656 else{
657 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
658 }
659
660 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
661 sleep (3);
662
663 my $out_1 = $file_1;
664 my $out_2 = $file_2;
665
666 if ($out_1 =~ /gz$/){
667 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
668 }
669 else{
670 $out_1 =~ s/trimmed\.fq$/val_1.fq/;
671 }
672
673 if ($out_2 =~ /gz$/){
674 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
675 }
676 else{
677 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
678 }
679
680 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
681 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
682 warn "Writing validated paired-end read 1 reads to $out_1\n";
683 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
684
685 my $unpaired_1;
686 my $unpaired_2;
687
688 if ($retain){
689
690 $unpaired_1 = $file_1;
691 $unpaired_2 = $file_2;
692
693 if ($unpaired_1 =~ /gz$/){
694 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
695 }
696 else{
697 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
698 }
699
700 if ($unpaired_2 =~ /gz$/){
701 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
702 }
703 else{
704 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
705 }
706
707 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
708 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
709
710 warn "Writing unpaired read 1 reads to $unpaired_1\n";
711 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
712 }
713
714 my $sequence_pairs_removed = 0;
715 my $read_1_printed = 0;
716 my $read_2_printed = 0;
717
718 my $count = 0;
719
720 while (1){
721 my $id_1 = <IN1>;
722 my $seq_1 = <IN1>;
723 my $l3_1 = <IN1>;
724 my $qual_1 = <IN1>;
725 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
726
727 my $id_2 = <IN2>;
728 my $seq_2 = <IN2>;
729 my $l3_2 = <IN2>;
730 my $qual_2 = <IN2>;
731 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
732
733 ++$count;
734
735
736 ## small check if the sequence files appear to FastQ files
737 if ($count == 1){ # performed just once
738 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
739 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
740 }
741 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
742 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
743 }
744 }
745
746 chomp $seq_1;
747 chomp $seq_2;
748
749
750 ### making sure that the reads do have a sensible length
751 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
752 ++$sequence_pairs_removed;
753 if ($retain){ # writing out single-end reads if they are longer than the cutoff
754
755 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
756 print UNPAIRED1 $id_1;
757 print UNPAIRED1 "$seq_1\n";
758 print UNPAIRED1 $l3_1;
759 print UNPAIRED1 $qual_1;
760 ++$read_1_printed;
761 }
762
763 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
764 print UNPAIRED2 $id_2;
765 print UNPAIRED2 "$seq_2\n";
766 print UNPAIRED2 $l3_2;
767 print UNPAIRED2 $qual_2;
768 ++$read_2_printed;
769 }
770
771 }
772 }
773 else{
774 print R1 $id_1;
775 print R1 "$seq_1\n";
776 print R1 $l3_1;
777 print R1 $qual_1;
778
779 print R2 $id_2;
780 print R2 "$seq_2\n";
781 print R2 $l3_2;
782 print R2 $qual_2;
783 }
784
785 }
786 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
787 warn "Total number of sequences analysed: $count\n\n";
788 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n";
789
790 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
791 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
792
793 if ($keep){
794 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
795 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
796 }
797
798 warn "\n";
799 if ($retain){
800 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
801 }
802 else{
803 return ($out_1,$out_2);
804 }
805 }
806
807 sub process_commandline{
808 my $help;
809 my $quality;
810 my $adapter;
811 my $adapter2;
812 my $stringency;
813 my $report;
814 my $version;
815 my $rrbs;
816 my $length_cutoff;
817 my $keep;
818 my $fastqc;
819 my $non_directional;
820 my $phred33;
821 my $phred64;
822 my $fastqc_args;
823 my $trim;
824 my $gzip;
825 my $validate;
826 my $retain;
827 my $length_read_1;
828 my $length_read_2;
829 my $error_rate;
830 my $output_dir;
831 my $no_report_file;
832 my $suppress_warn;
833
834 my $command_line = GetOptions ('help|man' => \$help,
835 'q|quality=i' => \$quality,
836 'a|adapter=s' => \$adapter,
837 'a2|adapter2=s' => \$adapter2,
838 'report' => \$report,
839 'version' => \$version,
840 'stringency=i' => \$stringency,
841 'fastqc' => \$fastqc,
842 'RRBS' => \$rrbs,
843 'keep' => \$keep,
844 'length=i' => \$length_cutoff,
845 'non_directional' => \$non_directional,
846 'phred33' => \$phred33,
847 'phred64' => \$phred64,
848 'fastqc_args=s' => \$fastqc_args,
849 'trim1' => \$trim,
850 'gzip' => \$gzip,
851 'paired_end' => \$validate,
852 'retain_unpaired' => \$retain,
853 'length_1|r1=i' => \$length_read_1,
854 'length_2|r2=i' => \$length_read_2,
855 'e|error_rate=s' => \$error_rate,
856 'o|output_dir=s' => \$output_dir,
857 'no_report_file' => \$no_report_file,
858 'suppress_warn' => \$suppress_warn,
859 );
860
861 ### EXIT ON ERROR if there were errors with any of the supplied options
862 unless ($command_line){
863 die "Please respecify command line options\n";
864 }
865
866 ### HELPFILE
867 if ($help){
868 print_helpfile();
869 exit;
870 }
871
872
873
874
875 if ($version){
876 print << "VERSION";
877
878 Quality-/Adapter-/RRBS-Trimming
879 (powered by Cutadapt)
880 version $trimmer_version
881
882 Last update: 18 10 2012
883
884 VERSION
885 exit;
886 }
887
888 ### RRBS
889 unless ($rrbs){
890 $rrbs = 0;
891 }
892
893 ### SUPRESS WARNINGS
894 if (defined $suppress_warn){
895 $DOWARN = 0;
896 }
897
898 ### QUALITY SCORES
899 my $phred_encoding;
900 if ($phred33){
901 if ($phred64){
902 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
903 }
904 $phred_encoding = 33;
905 }
906 elsif ($phred64){
907 $phred_encoding = 64;
908 }
909 unless ($phred33 or $phred64){
910 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
911 $phred_encoding = 33;
912 sleep (3);
913 }
914
915 ### NON-DIRECTIONAL RRBS
916 if ($non_directional){
917 unless ($rrbs){
918 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
919 }
920 }
921 else{
922 $non_directional = 0;
923 }
924
925 if ($fastqc_args){
926 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
927 }
928 else{
929 $fastqc_args = 0;
930 }
931
932 ### CUSTOM ERROR RATE
933 if (defined $error_rate){
934 # make sure that the error rate is between 0 and 1
935 unless ($error_rate >= 0 and $error_rate <= 1){
936 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
937 }
938 }
939 else{
940 $error_rate = 0.1; # (default)
941 }
942
943 if (defined $adapter){
944 unless ($adapter =~ /^[ACTGNactgn]+$/){
945 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
946 }
947 $adapter = uc$adapter;
948 }
949
950 if (defined $adapter2){
951 unless ($validate){
952 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
953 }
954 unless ($adapter2 =~ /^[ACTGNactgn]+$/){
955 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
956 }
957 $adapter2 = uc$adapter2;
958 }
959
960 ### files are supposed to be paired-end files
961 if ($validate){
962
963 # making sure that an even number of reads has been supplied
964 unless ((scalar@ARGV)%2 == 0){
965 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
966 }
967
968 ## CUTOFF FOR VALIDATED READ-PAIRS
969
970 if (defined $length_read_1 or defined $length_read_2){
971 unless ($retain){
972 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
973 }
974
975 if (defined $length_read_1){
976 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
977 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
978 }
979 unless ($length_read_1 > $length_cutoff){
980 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
981 }
982 }
983
984 if (defined $length_read_2){
985 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
986 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
987 }
988 unless ($length_read_2 > $length_cutoff){
989 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
990 }
991 }
992 }
993
994 if ($retain){
995 $length_read_1 = 35 unless (defined $length_read_1);
996 $length_read_2 = 35 unless (defined $length_read_2);
997 }
998 }
999
1000 unless ($no_report_file){
1001 $no_report_file = 0;
1002 }
1003
1004 ### OUTPUT DIR PATH
1005 if ($output_dir){
1006 unless ($output_dir =~ /\/$/){
1007 $output_dir =~ s/$/\//;
1008 }
1009 }
1010 else{
1011 $output_dir = '';
1012 }
1013
1014 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
1015 }
1016
1017
1018
1019
1020 sub print_helpfile{
1021 print << "HELP";
1022
1023 USAGE:
1024
1025 trim_galore [options] <filename(s)>
1026
1027
1028 -h/--help Print this help message and exits.
1029
1030 -v/--version Print the version information and exits.
1031
1032 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For
1033 RRBS samples, quality trimming will be performed first, and adapter
1034 trimming is carried in a second round. Other files are quality and adapter
1035 trimmed in a single pass. The algorithm is the same as the one used by BWA
1036 (Subtract INT from all qualities; compute partial sums from all indices
1037 to the end of the sequence; cut sequence at the index at which the sum is
1038 minimal). Default Phred score: 20.
1039
1040 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
1041 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
1042
1043 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
1044 (Illumina 1.5 encoding) for quality trimming.
1045
1046 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete.
1047
1048 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed
1049 to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
1050 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
1051 automatically invoke FastQC, so --fastqc does not have to be specified
1052 separately.
1053
1054 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13
1055 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
1056
1057 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
1058 option requires '--paired' to be specified as well.
1059
1060
1061 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
1062 very stringent setting of '1', i.e. even a single bp of overlapping sequence
1063 will be trimmed of the 3' end of any read.
1064
1065 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
1066 region) (default: 0.1)
1067
1068 --gzip Compress the output file with gzip. If the input files are gzip-compressed
1069 the output files will be automatically gzip compressed as well.
1070
1071 --length <INT> Discard reads that became shorter than length INT because of either
1072 quality or adapter trimming. A value of '0' effectively disables
1073 this behaviour. Default: 20 bp.
1074
1075 For paired-end files, both reads of a read-pair need to be longer than
1076 <INT> bp to be printed out to validated paired-end files (see option --paired).
1077 If only one read became too short there is the possibility of keeping such
1078 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
1079
1080 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current
1081 directory.
1082
1083 --no_report_file If specified no report file will be generated.
1084
1085 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed.
1086
1087
1088
1089 RRBS-specific options (MspI digested material):
1090
1091 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
1092 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
1093 removed from their 3' end. This is to avoid that the filled-in C close to the
1094 second MspI site in a sequence is used for methylation calls. Sequences which
1095 were merely trimmed because of poor quality will not be shortened further.
1096
1097 --non_directional Selecting this option for non-directional RRBS libraries will screen
1098 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
1099 and, if found, removes the first two basepairs. Like with the option
1100 '--rrbs' this avoids using cytosine positions that were filled-in
1101 during the end-repair step. '--non_directional' requires '--rrbs' to
1102 be specified as well.
1103
1104 --keep Keep the quality trimmed intermediate file. Default: off, which means
1105 the temporary file is being deleted after adapter trimming. Only has
1106 an effect for RRBS samples since other FastQ files are not trimmed
1107 for poor qualities separately.
1108
1109
1110 Note for RRBS using MseI:
1111
1112 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
1113 to specify --rrbs or --non_directional since virtually all reads should start with the sequence
1114 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
1115 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
1116 run Trim Galore! in the standard (i.e. non-RRBS) mode.
1117
1118
1119 Paired-end specific options:
1120
1121 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for
1122 paired-end files. To pass the validation test, both sequences of a sequence pair
1123 are required to have a certain minimum length which is governed by the option
1124 --length (see above). If only one read passes this length threshold the
1125 other read can be rescued (see option --retain_unpaired). Using this option lets
1126 you discard too short read pairs without disturbing the sequence-by-sequence order
1127 of FastQ files which is required by many aligners.
1128
1129 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
1130 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
1131
1132 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
1133 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
1134 alignments like this:
1135
1136 R1 ---------------------------> or this: -----------------------> R1
1137 R2 <--------------------------- <----------------- R2
1138
1139 as invalid (whenever a start/end coordinate is contained within the other read).
1140
1141 --retain_unpaired If only one of the two paired-end reads became too short, the longer
1142 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
1143 output files. The length cutoff for unpaired single-end reads is
1144 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
1145
1146 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to
1147 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
1148 Default: 35 bp.
1149
1150 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
1151 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
1152 Default: 35 bp.
1153
1154
1155 Last modified on 18 Oct 2012.
1156
1157 HELP
1158 exit;
1159 }