0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5 use IPC::Open3;
|
|
6 use File::Spec;
|
|
7 use File::Basename;
|
|
8 use Cwd;
|
|
9
|
4
|
10 ## This program is Copyright (C) 2012-14, Felix Krueger (felix.krueger@babraham.ac.uk)
|
0
|
11
|
|
12 ## This program is free software: you can redistribute it and/or modify
|
|
13 ## it under the terms of the GNU General Public License as published by
|
|
14 ## the Free Software Foundation, either version 3 of the License, or
|
|
15 ## (at your option) any later version.
|
|
16
|
|
17 ## This program is distributed in the hope that it will be useful,
|
|
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 ## GNU General Public License for more details.
|
|
21
|
|
22 ## You should have received a copy of the GNU General Public License
|
|
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
|
25
|
|
26
|
|
27 ## this script is taking in FastQ sequences and trims them with Cutadapt
|
4
|
28
|
|
29 ## last modified on 16 July 2014
|
|
30
|
|
31
|
0
|
32
|
|
33 ########################################################################
|
|
34
|
|
35 # change these paths if needed
|
|
36
|
|
37 my $path_to_cutadapt = 'cutadapt';
|
|
38 my $path_to_fastqc = 'fastqc';
|
|
39
|
|
40 ########################################################################
|
|
41
|
4
|
42 my $trimmer_version = '0.3.7';
|
0
|
43 my $DOWARN = 1; # print on screen warning and text by default
|
|
44 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
|
|
45
|
4
|
46 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2) = process_commandline();
|
1
|
47
|
|
48 my @filenames = @ARGV;
|
|
49
|
4
|
50
|
1
|
51 die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
|
|
52 USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames);
|
|
53
|
0
|
54
|
|
55 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
|
|
56 unless (defined $cutoff){
|
|
57 $cutoff = 20;
|
|
58 }
|
|
59 my $phred_score_cutoff = $cutoff; # only relevant for report
|
|
60
|
|
61 unless (defined $adapter){
|
|
62 $adapter = 'AGATCGGAAGAGC';
|
|
63 }
|
|
64 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
|
|
65 $a2 = '';
|
|
66 }
|
|
67
|
|
68 unless (defined $stringency){
|
|
69 $stringency = 1;
|
|
70 }
|
|
71
|
|
72 if ($phred_encoding == 64){
|
|
73 $cutoff += 31;
|
|
74 }
|
|
75
|
|
76 my $file_1;
|
|
77 my $file_2;
|
|
78
|
|
79 foreach my $filename (@ARGV){
|
|
80 trim ($filename);
|
|
81 }
|
|
82
|
|
83
|
|
84 sub trim{
|
|
85 my $filename = shift;
|
|
86
|
|
87 my $output_filename = (split (/\//,$filename))[-1];
|
|
88
|
|
89 my $report = $output_filename;
|
|
90 $report =~ s/$/_trimming_report.txt/;
|
|
91
|
|
92 if ($no_report_file) {
|
|
93 $report = File::Spec->devnull;
|
4
|
94 open (REPORT,'>',$report) or die "Failed to write to file '$report': $!\n";
|
0
|
95 # warn "Redirecting report output to /dev/null\n";
|
|
96 }
|
|
97 else{
|
4
|
98 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file '$report': $!\n";
|
0
|
99 warn "Writing report to '$output_dir$report'\n";
|
|
100 }
|
|
101
|
|
102 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
103 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
104
|
4
|
105 if ($validate){ # paired-end mode
|
|
106 warn "Trimming mode: paired-end\n";
|
|
107 print REPORT "Trimming mode: paired-end\n";
|
|
108 }
|
|
109 else{
|
|
110 warn "Trimming mode: single-end\n";
|
|
111 print REPORT "Trimming mode: single-end\n";
|
|
112 }
|
|
113
|
|
114
|
|
115 warn "Trim Galore version: $trimmer_version\n";
|
|
116 print REPORT "Trim Galore version: $trimmer_version\n";
|
|
117
|
0
|
118 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
119 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
120
|
|
121 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
122 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
123
|
|
124 warn "Adapter sequence: '$adapter'\n";
|
|
125 print REPORT "Adapter sequence: '$adapter'\n";
|
|
126
|
|
127 if ($error_rate == 0.1){
|
|
128 warn "Maximum trimming error rate: $error_rate (default)\n";
|
|
129 }
|
|
130 else{
|
|
131 warn "Maximum trimming error rate: $error_rate\n";
|
|
132 }
|
|
133
|
|
134 print REPORT "Maximum trimming error rate: $error_rate";
|
|
135 if ($error_rate == 0.1){
|
|
136 print REPORT " (default)\n";
|
|
137 }
|
|
138 else{
|
|
139 print REPORT "\n";
|
|
140 }
|
|
141
|
|
142 if ($a2){
|
|
143 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
144 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
145 }
|
|
146
|
|
147 warn "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
148 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
149
|
|
150 if ($validate){
|
|
151 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
152 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
153 }
|
|
154 else{
|
|
155 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
156 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
157 }
|
|
158
|
|
159 if ($validate){ # only for paired-end files
|
|
160
|
|
161 if ($retain){ # keeping single-end reads if only one end is long enough
|
|
162
|
|
163 if ($length_read_1 == 35){
|
|
164 warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
165 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
166 }
|
|
167 else{
|
|
168 warn "Length cut-off for read 1: $length_read_1 bp\n";
|
|
169 print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
|
|
170 }
|
|
171
|
|
172 if ($length_read_2 == 35){
|
1
|
173 warn "Length cut-off for read 2: $length_read_2 bb (default)\n";
|
0
|
174 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
|
|
175 }
|
|
176 else{
|
|
177 warn "Length cut-off for read 2: $length_read_2 bp\n";
|
|
178 print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
|
|
179 }
|
|
180 }
|
|
181 }
|
|
182
|
|
183 if ($rrbs){
|
|
184 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
185 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
186 }
|
|
187
|
|
188 if ($non_directional){
|
|
189 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
190 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
191 }
|
|
192
|
|
193 if ($trim){
|
|
194 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
195 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
196 }
|
|
197
|
1
|
198 if ($clip_r1){
|
|
199 warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
|
|
200 print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
|
|
201 }
|
|
202 if ($clip_r2){
|
|
203 warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
|
|
204 print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
|
|
205 }
|
|
206
|
4
|
207 if ($three_prime_clip_r1){
|
|
208 warn "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
|
|
209 print REPORT "All Read 1 sequences will be trimmed by $three_prime_clip_r1 bp from their 3' end to avoid poor qualities or biases\n";
|
|
210 }
|
|
211 if ($three_prime_clip_r2){
|
|
212 warn "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
|
|
213 print REPORT "All Read 2 sequences will be trimmed by $three_prime_clip_r2 bp from their 3' end to avoid poor qualities or biases\n";
|
|
214 }
|
1
|
215
|
0
|
216 if ($fastqc){
|
|
217 warn "Running FastQC on the data once trimming has completed\n";
|
|
218 print REPORT "Running FastQC on the data once trimming has completed\n";
|
|
219
|
|
220 if ($fastqc_args){
|
|
221 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
|
|
222 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n";
|
|
223 }
|
|
224 }
|
|
225
|
|
226 if ($keep and $rrbs){
|
|
227 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
228 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
229 }
|
|
230
|
1
|
231
|
0
|
232 if ($gzip or $filename =~ /\.gz$/){
|
1
|
233 $gzip = 1;
|
|
234 unless ($dont_gzip){
|
|
235 warn "Output file(s) will be GZIP compressed\n";
|
|
236 print REPORT "Output file will be GZIP compressed\n";
|
|
237 }
|
0
|
238 }
|
|
239
|
|
240 warn "\n";
|
|
241 print REPORT "\n";
|
|
242 sleep (3);
|
|
243
|
|
244 my $temp;
|
|
245
|
|
246 ### Proceeding differently for RRBS and other type of libraries
|
|
247 if ($rrbs){
|
|
248
|
|
249 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
|
|
250 if ($cutoff == 0){
|
|
251 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n";
|
|
252 sleep (3);
|
|
253 }
|
|
254 else{
|
|
255
|
|
256 $temp = $filename;
|
4
|
257 $temp =~ s/^.*\///; # replacing optional file path information
|
0
|
258 $temp =~ s/$/_qual_trimmed.fastq/;
|
4
|
259 open (TEMP,'>',$output_dir.$temp) or die "Can't write to '$temp': $!";
|
0
|
260
|
|
261 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
|
|
262 sleep (3);
|
|
263
|
|
264 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
|
|
265
|
|
266 my $qual_count = 0;
|
|
267
|
|
268 while (1){
|
|
269 my $l1 = <QUAL>;
|
|
270 my $seq = <QUAL>;
|
|
271 my $l3 = <QUAL>;
|
|
272 my $qual = <QUAL>;
|
|
273 last unless (defined $qual);
|
|
274
|
|
275 $qual_count++;
|
|
276 if ($qual_count%10000000 == 0){
|
|
277 warn "$qual_count sequences processed\n";
|
|
278 }
|
|
279 print TEMP "$l1$seq$l3$qual";
|
|
280 }
|
|
281
|
|
282 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
|
4
|
283 close QUAL or die "Unable to close QUAL filehandle: $!\n";
|
0
|
284 sleep (3);
|
|
285
|
|
286 }
|
|
287 }
|
|
288
|
|
289
|
|
290 if ($output_filename =~ /\.fastq$/){
|
|
291 $output_filename =~ s/\.fastq$/_trimmed.fq/;
|
|
292 }
|
|
293 elsif ($output_filename =~ /\.fastq\.gz$/){
|
|
294 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
|
|
295 }
|
|
296 elsif ($output_filename =~ /\.fq$/){
|
|
297 $output_filename =~ s/\.fq$/_trimmed.fq/;
|
|
298 }
|
|
299 elsif ($output_filename =~ /\.fq\.gz$/){
|
|
300 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
|
|
301 }
|
|
302 else{
|
|
303 $output_filename =~ s/$/_trimmed.fq/;
|
|
304 }
|
|
305
|
1
|
306 if ($gzip or $filename =~ /\.gz$/){
|
4
|
307 if ($dont_gzip){
|
|
308 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
|
1
|
309 }
|
|
310 else{
|
4
|
311 ### 6 Jan 2014: had a request to also gzip intermediate files to save disk space
|
|
312 # if ($validate){
|
|
313 # open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n"; # don't need to gzip intermediate file
|
|
314 # }
|
|
315 $output_filename .= '.gz';
|
|
316 open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to '$output_filename': $!\n";
|
1
|
317 }
|
|
318 }
|
|
319 else{
|
4
|
320 open (OUT,'>',$output_dir.$output_filename) or die "Can't open '$output_filename': $!\n";
|
1
|
321 }
|
0
|
322 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
|
|
323
|
|
324 my $count = 0;
|
|
325 my $too_short = 0;
|
|
326 my $quality_trimmed = 0;
|
|
327 my $rrbs_trimmed = 0;
|
|
328 my $rrbs_trimmed_start = 0;
|
|
329 my $CAA = 0;
|
|
330 my $CGA = 0;
|
|
331
|
4
|
332 my $pid;
|
|
333
|
0
|
334 if ($rrbs and $cutoff != 0){
|
|
335
|
|
336 ### optionally using 2 different adapters for read 1 and read 2
|
|
337 if ($validate and $a2){
|
|
338 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
339 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
340 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
341 sleep (3);
|
4
|
342 $pid = open (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
0
|
343 }
|
|
344 else{ # this is read 2 of a pair
|
|
345 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
|
|
346 sleep (3);
|
4
|
347 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
0
|
348 }
|
|
349 }
|
|
350 ### Using the same adapter for both read 1 and read 2
|
|
351 else{
|
|
352 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
353 sleep (3);
|
4
|
354 $pid = open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
0
|
355 }
|
|
356
|
|
357 close WRITER or die $!; # not needed
|
|
358
|
|
359 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
|
|
360
|
|
361 if ($filename =~ /\.gz$/){
|
|
362 open (IN,"zcat $filename |") or die $!; # original, untrimmed file
|
|
363 }
|
|
364 else{
|
|
365 open (IN,$filename) or die $!; # original, untrimmed file
|
|
366 }
|
|
367
|
|
368 while (1){
|
|
369
|
|
370 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
|
|
371 my $l1 = <TRIM>;
|
|
372 my $seq = <TRIM>; # adapter trimmed sequence
|
|
373 my $l3 = <TRIM>;
|
|
374 my $qual = <TRIM>;
|
|
375
|
|
376 $_ = <IN>; # irrelevant
|
|
377 my $original_seq = <IN>;
|
|
378 $_ = <IN>; # irrelevant
|
|
379 $_ = <IN>; # irrelevant
|
|
380
|
|
381 $_ = <QUAL>; # irrelevant
|
|
382 my $qual_trimmed_seq = <QUAL>;
|
|
383 $_ = <QUAL>; # irrelevant
|
|
384 my $qual_trimmed_qual = <QUAL>;
|
|
385
|
|
386 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
|
|
387
|
|
388 $count++;
|
|
389 if ($count%10000000 == 0){
|
|
390 warn "$count sequences processed\n";
|
|
391 }
|
|
392
|
|
393 chomp $seq;
|
|
394 chomp $qual;
|
|
395 chomp $qual_trimmed_seq;
|
|
396 chomp $original_seq;
|
|
397
|
|
398 my $quality_trimmed_seq_length = length $qual_trimmed_seq;
|
|
399
|
|
400 if (length $original_seq > length $qual_trimmed_seq){
|
|
401 ++$quality_trimmed;
|
|
402 }
|
|
403
|
|
404 my $nd = 0;
|
|
405
|
|
406 ### NON-DIRECTIONAL RRBS
|
|
407 if ($non_directional){
|
|
408 if (length$seq > 2){
|
|
409 if ($seq =~ /^CAA/){
|
|
410 ++$CAA;
|
|
411 $seq = substr ($seq,2,length($seq)-2);
|
|
412 $qual = substr ($qual,2,length($qual)-2);
|
|
413 ++$rrbs_trimmed_start;
|
|
414 $nd = 1;
|
|
415 }
|
|
416 elsif ($seq =~ /^CGA/){
|
|
417 $seq = substr ($seq,2,length($seq)-2);
|
|
418 $qual = substr ($qual,2,length($qual)-2);
|
|
419 ++$CGA;
|
|
420 ++$rrbs_trimmed_start;
|
|
421 $nd = 1;
|
|
422 }
|
|
423 }
|
|
424 }
|
|
425
|
|
426 ### directional read
|
|
427 unless ($nd == 1){
|
|
428 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
|
|
429 $seq = substr ($seq,0,length($seq)-2);
|
|
430 $qual = substr ($qual,0,length($qual)-2);
|
|
431 ++$rrbs_trimmed;
|
|
432 }
|
|
433 }
|
|
434
|
|
435 ### Shortening all sequences by 1 bp on the 3' end
|
|
436 if ($trim){
|
|
437 $seq = substr($seq,0,length($seq)-1);
|
|
438 $qual = substr($qual,0,length($qual)-1);
|
|
439 }
|
|
440
|
|
441 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
442 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
443 print OUT "$l1$seq\n$l3$qual\n";
|
|
444 }
|
|
445 else{ # single end
|
1
|
446
|
|
447 if ($clip_r1){
|
4
|
448 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again
|
|
449 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
450 $qual = substr($qual,$clip_r1);
|
|
451 }
|
|
452 }
|
|
453
|
|
454 if ($three_prime_clip_r1){
|
|
455
|
|
456 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
|
|
457 # warn "seq/qual before/after trimming:\n$seq\n$qual\n";
|
|
458 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
|
|
459 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1 ));
|
|
460 # warn "$seq\n$qual\n";
|
|
461 }
|
|
462
|
1
|
463 }
|
|
464
|
0
|
465 if (length $seq < $length_cutoff){
|
|
466 ++$too_short;
|
|
467 next;
|
|
468 }
|
|
469 else{
|
|
470 print OUT "$l1$seq\n$l3$qual\n";
|
|
471 }
|
|
472 }
|
|
473 }
|
|
474
|
|
475 print REPORT "\n";
|
|
476 while (<ERROR>){
|
|
477 warn $_;
|
|
478 print REPORT $_;
|
|
479 }
|
|
480
|
|
481 close IN or die "Unable to close IN filehandle: $!";
|
|
482 close QUAL or die "Unable to close QUAL filehandle: $!";
|
|
483 close TRIM or die "Unable to close TRIM filehandle: $!";
|
|
484 close OUT or die "Unable to close OUT filehandle: $!";
|
4
|
485
|
0
|
486 }
|
|
487 else{
|
|
488
|
|
489 ### optionally using 2 different adapters for read 1 and read 2
|
|
490 if ($validate and $a2){
|
|
491 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
492 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
493 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
494 sleep (3);
|
4
|
495 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
0
|
496 }
|
|
497 else{ # this is read 2 of a pair
|
|
498 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
|
|
499 sleep (3);
|
4
|
500 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
|
0
|
501 }
|
|
502 }
|
|
503 ### Using the same adapter for both read 1 and read 2
|
|
504 else{
|
|
505 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
506 sleep (3);
|
4
|
507 $pid = open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
0
|
508 }
|
|
509
|
|
510 close WRITER or die $!; # not needed
|
|
511
|
|
512 while (1){
|
|
513
|
|
514 my $l1 = <TRIM>;
|
|
515 my $seq = <TRIM>; # quality and/or adapter trimmed sequence
|
|
516 my $l3 = <TRIM>;
|
|
517 my $qual = <TRIM>;
|
|
518 # print "$l1$seq\n$l3$qual\n";
|
|
519 last unless (defined $qual); # could be an empty string
|
|
520
|
|
521 $count++;
|
|
522 if ($count%10000000 == 0){
|
|
523 warn "$count sequences processed\n";
|
|
524 }
|
|
525
|
|
526 chomp $seq;
|
|
527 chomp $qual;
|
|
528
|
|
529 ### Shortening all sequences by 1 bp on the 3' end
|
|
530 if ($trim){
|
|
531 $seq = substr($seq,0,length($seq)-1);
|
|
532 $qual = substr($qual,0,length($qual)-1);
|
|
533 }
|
|
534
|
|
535 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
536 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
537 print OUT "$l1$seq\n$l3$qual\n";
|
|
538 }
|
|
539 else{ # single end
|
4
|
540
|
1
|
541 if ($clip_r1){
|
4
|
542 if (length $seq > $clip_r1){ # sequences that are already too short won't be clipped again
|
|
543 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
544 $qual = substr($qual,$clip_r1);
|
|
545 }
|
|
546 }
|
1
|
547
|
4
|
548 if ($three_prime_clip_r1){
|
|
549 if (length $seq > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
|
|
550 # warn "seq/qual before/after trimming:\n$seq\n$qual\n";
|
|
551 $seq = substr($seq,0,(length($seq) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
|
|
552 $qual = substr($qual,0,(length($qual) - $three_prime_clip_r1));
|
|
553 # warn "$seq\n$qual\n";sleep(1);
|
|
554 }
|
|
555 }
|
|
556
|
0
|
557 if (length $seq < $length_cutoff){
|
|
558 ++$too_short;
|
|
559 next;
|
|
560 }
|
|
561 else{
|
|
562 print OUT "$l1$seq\n$l3$qual\n";
|
|
563 }
|
|
564 }
|
|
565 }
|
|
566
|
|
567 print REPORT "\n";
|
|
568 while (<ERROR>){
|
|
569 warn $_;
|
|
570 print REPORT $_;
|
|
571 }
|
|
572
|
|
573 close TRIM or die "Unable to close TRIM filehandle: $!\n";
|
|
574 close ERROR or die "Unable to close ERROR filehandle: $!\n";
|
|
575 close OUT or die "Unable to close OUT filehandle: $!\n";
|
|
576
|
|
577 }
|
|
578
|
4
|
579
|
0
|
580 if ($rrbs){
|
|
581 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
|
|
582
|
|
583 # deleting temporary quality trimmed file
|
|
584 my $deleted = unlink "$output_dir$temp";
|
|
585
|
|
586 if ($deleted){
|
|
587 warn "Successfully deleted temporary file $temp\n\n";
|
|
588 }
|
|
589 else{
|
|
590 warn "Could not delete temporary file $temp";
|
|
591 }
|
|
592 }
|
|
593 }
|
|
594
|
4
|
595 ### Wait and reap the child process (Cutadapt) so that it doesn't become a zombie process
|
|
596 waitpid $pid, 0;
|
|
597 unless ($? == 0){
|
|
598 die "\n\nCutadapt terminated with exit signal: '$?'.\nTerminating Trim Galore run, please check error message(s) to get an idea what went wrong...\n\n";
|
|
599 }
|
|
600
|
0
|
601 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
602 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
603
|
|
604 warn "="x 45,"\n";
|
|
605 print REPORT "="x 45,"\n";
|
|
606
|
|
607 warn "$count sequences processed in total\n";
|
|
608 print REPORT "$count sequences processed in total\n";
|
|
609
|
|
610 ### only reporting this separately if quality and adapter trimming were performed separately
|
|
611 if ($rrbs){
|
4
|
612 my $percentage_shortened;
|
|
613 if ($count){
|
|
614 $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
|
|
615 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
616 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
617 }
|
|
618 else{
|
|
619 warn "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
|
|
620 print REPORT "Unable to determine percentage of reads that were shortened because 0 lines were processed\n\n";
|
|
621 }
|
0
|
622 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
623 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
624 }
|
|
625
|
4
|
626 my $percentage_too_short;
|
|
627 if ($count){
|
|
628 $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
|
|
629 }
|
|
630 else{
|
|
631 $percentage_too_short = 'N/A';
|
|
632 }
|
|
633
|
|
634 if ($validate){ ### only for paired-end files
|
|
635 warn "The length threshold of paired-end sequences gets evaluated later on (in the validation step)\n";
|
|
636 }
|
|
637 else{ ### Single-end file
|
|
638 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
639 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
640 }
|
0
|
641
|
|
642 if ($rrbs){
|
|
643 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
|
|
644 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
645 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
646 }
|
|
647
|
|
648 if ($non_directional){
|
|
649 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
|
|
650 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
651 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
652 }
|
|
653
|
|
654 warn "\n";
|
|
655 print REPORT "\n";
|
|
656
|
1
|
657 ### RUNNING FASTQC unless we are dealing with paired-end files
|
|
658 unless($validate){
|
|
659 if ($fastqc){
|
|
660 warn "\n >>> Now running FastQC on the data <<<\n\n";
|
|
661 sleep (5);
|
|
662 if ($fastqc_args){
|
|
663 system ("$path_to_fastqc $fastqc_args $output_dir$output_filename");
|
|
664 }
|
|
665 else{
|
|
666 system ("$path_to_fastqc $output_dir$output_filename");
|
|
667 }
|
0
|
668 }
|
|
669 }
|
|
670
|
|
671 ### VALIDATE PAIRED-END FILES
|
|
672 if ($validate){
|
|
673
|
|
674 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
675
|
|
676 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
677 $file_1 = $output_filename;
|
|
678 shift @filenames;
|
|
679 # warn "This is read 1: $file_1\n\n";
|
|
680 }
|
|
681 else{ # this is read 2 of a pair
|
|
682 $file_2 = $output_filename;
|
|
683 shift @filenames;
|
|
684 # warn "This is read 2: $file_2\n\n";
|
|
685 }
|
|
686
|
|
687 if ($file_1 and $file_2){
|
|
688 warn "Validate paired-end files $file_1 and $file_2\n";
|
|
689 sleep (1);
|
|
690
|
|
691 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
|
|
692
|
|
693 ### RUNNING FASTQC
|
|
694 if ($fastqc){
|
|
695
|
|
696 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n";
|
|
697 sleep (3);
|
|
698
|
|
699 if ($fastqc_args){
|
1
|
700 system ("$path_to_fastqc $fastqc_args $output_dir$val_1");
|
0
|
701 }
|
|
702 else{
|
1
|
703 system ("$path_to_fastqc $output_dir$val_1");
|
0
|
704 }
|
|
705
|
|
706 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n";
|
|
707 sleep (3);
|
|
708
|
|
709 if ($fastqc_args){
|
1
|
710 system ("$path_to_fastqc $fastqc_args $output_dir$val_2");
|
0
|
711 }
|
|
712 else{
|
1
|
713 system ("$path_to_fastqc $output_dir$val_2");
|
0
|
714 }
|
|
715
|
|
716 }
|
|
717
|
|
718 warn "Deleting both intermediate output files $file_1 and $file_2\n";
|
|
719 unlink "$output_dir$file_1";
|
|
720 unlink "$output_dir$file_2";
|
|
721
|
|
722 warn "\n",'='x100,"\n\n";
|
|
723 sleep (3);
|
|
724
|
|
725 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
|
|
726 $file_2 = undef;
|
|
727 }
|
|
728 }
|
|
729
|
|
730 }
|
|
731
|
|
732 sub validate_paired_end_files{
|
|
733
|
|
734 my $file_1 = shift;
|
|
735 my $file_2 = shift;
|
|
736
|
1
|
737 warn "file_1: $file_1, file_2: $file_2\n\n";
|
0
|
738
|
|
739 if ($file_1 =~ /\.gz$/){
|
|
740 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
|
|
741 }
|
|
742 else{
|
|
743 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
|
|
744 }
|
|
745
|
|
746 if ($file_2 =~ /\.gz$/){
|
|
747 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
|
|
748 }
|
|
749 else{
|
|
750 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
|
|
751 }
|
|
752
|
|
753 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
|
|
754 sleep (3);
|
|
755
|
|
756 my $out_1 = $file_1;
|
|
757 my $out_2 = $file_2;
|
|
758
|
|
759 if ($out_1 =~ /gz$/){
|
|
760 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
|
|
761 }
|
|
762 else{
|
|
763 $out_1 =~ s/trimmed\.fq$/val_1.fq/;
|
|
764 }
|
|
765
|
|
766 if ($out_2 =~ /gz$/){
|
|
767 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
|
|
768 }
|
|
769 else{
|
|
770 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
|
|
771 }
|
|
772
|
1
|
773 if ($gzip){
|
|
774 if ($dont_gzip){
|
|
775 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
|
|
776 }
|
|
777 else{
|
|
778 $out_1 .= '.gz';
|
|
779 open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n";
|
|
780 }
|
|
781 }
|
|
782 else{
|
|
783 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
|
|
784 }
|
|
785
|
|
786 if ($gzip){
|
|
787 if ($dont_gzip){
|
|
788 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
|
|
789 }
|
|
790 else{
|
|
791 $out_2 .= '.gz';
|
|
792 open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n";
|
|
793 }
|
|
794 }
|
|
795 else{
|
|
796 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
|
|
797 }
|
|
798
|
0
|
799 warn "Writing validated paired-end read 1 reads to $out_1\n";
|
|
800 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
|
|
801
|
|
802 my $unpaired_1;
|
|
803 my $unpaired_2;
|
|
804
|
|
805 if ($retain){
|
|
806
|
|
807 $unpaired_1 = $file_1;
|
|
808 $unpaired_2 = $file_2;
|
|
809
|
|
810 if ($unpaired_1 =~ /gz$/){
|
|
811 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
|
|
812 }
|
|
813 else{
|
|
814 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
|
|
815 }
|
|
816
|
|
817 if ($unpaired_2 =~ /gz$/){
|
|
818 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
|
|
819 }
|
|
820 else{
|
|
821 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
|
|
822 }
|
|
823
|
1
|
824 if ($gzip){
|
|
825 if ($dont_gzip){
|
|
826 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
|
|
827 }
|
|
828 else{
|
|
829 $unpaired_1 .= '.gz';
|
|
830 open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n";
|
|
831 }
|
|
832 }
|
|
833 else{
|
|
834 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
|
|
835 }
|
|
836
|
|
837 if ($gzip){
|
|
838 if ($dont_gzip){
|
|
839 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
|
|
840 }
|
|
841 else{
|
|
842 $unpaired_2 .= '.gz';
|
|
843 open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n";
|
|
844 }
|
|
845 }
|
|
846 else{
|
|
847 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
|
|
848 }
|
0
|
849
|
|
850 warn "Writing unpaired read 1 reads to $unpaired_1\n";
|
|
851 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
|
|
852 }
|
|
853
|
|
854 my $sequence_pairs_removed = 0;
|
|
855 my $read_1_printed = 0;
|
|
856 my $read_2_printed = 0;
|
|
857
|
|
858 my $count = 0;
|
|
859
|
|
860 while (1){
|
|
861 my $id_1 = <IN1>;
|
|
862 my $seq_1 = <IN1>;
|
|
863 my $l3_1 = <IN1>;
|
|
864 my $qual_1 = <IN1>;
|
|
865 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
|
|
866
|
|
867 my $id_2 = <IN2>;
|
|
868 my $seq_2 = <IN2>;
|
|
869 my $l3_2 = <IN2>;
|
|
870 my $qual_2 = <IN2>;
|
|
871 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
|
|
872
|
|
873 ++$count;
|
|
874
|
|
875
|
1
|
876 ## small check if the sequence files appear to be FastQ files
|
0
|
877 if ($count == 1){ # performed just once
|
|
878 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
|
|
879 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
880 }
|
|
881 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
|
|
882 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
883 }
|
|
884 }
|
|
885
|
|
886 chomp $seq_1;
|
|
887 chomp $seq_2;
|
4
|
888 chomp $qual_1;
|
|
889 chomp $qual_2;
|
0
|
890
|
1
|
891 if ($clip_r1){
|
4
|
892 if (length $seq_1 > $clip_r1){ # sequences that are already too short won't be trimmed again
|
|
893 $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
894 $qual_1 = substr($qual_1,$clip_r1);
|
|
895 }
|
1
|
896 }
|
|
897 if ($clip_r2){
|
4
|
898 if (length $seq_2 > $clip_r2){ # sequences that are already too short won't be trimmed again
|
|
899 $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
|
|
900 $qual_2 = substr($qual_2,$clip_r2);
|
|
901 }
|
1
|
902 }
|
0
|
903
|
4
|
904 if ($three_prime_clip_r1){
|
|
905 if (length $seq_1 > $three_prime_clip_r1){ # sequences that are already too short won't be clipped again
|
|
906 $seq_1 = substr($seq_1,0,(length($seq_1) - $three_prime_clip_r1)); # starting after the sequences to be trimmed until the end of the sequence
|
|
907 $qual_1 = substr($qual_1,0,(length($qual_1) - $three_prime_clip_r1));
|
|
908 }
|
|
909 }
|
|
910 if ($three_prime_clip_r2){
|
|
911 if (length $seq_2 > $three_prime_clip_r2){ # sequences that are already too short won't be clipped again
|
|
912 $seq_2 = substr($seq_2,0,(length($seq_2) - $three_prime_clip_r2)); # starting after the sequences to be trimmed until the end of the sequence
|
|
913 $qual_2 = substr($qual_2,0,(length($qual_2) - $three_prime_clip_r2));
|
|
914 }
|
|
915 }
|
|
916
|
|
917
|
|
918
|
0
|
919 ### making sure that the reads do have a sensible length
|
|
920 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
|
|
921 ++$sequence_pairs_removed;
|
|
922 if ($retain){ # writing out single-end reads if they are longer than the cutoff
|
|
923
|
|
924 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
|
|
925 print UNPAIRED1 $id_1;
|
|
926 print UNPAIRED1 "$seq_1\n";
|
|
927 print UNPAIRED1 $l3_1;
|
4
|
928 print UNPAIRED1 "$qual_1\n";
|
0
|
929 ++$read_1_printed;
|
|
930 }
|
|
931
|
|
932 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
|
|
933 print UNPAIRED2 $id_2;
|
|
934 print UNPAIRED2 "$seq_2\n";
|
|
935 print UNPAIRED2 $l3_2;
|
4
|
936 print UNPAIRED2 "$qual_2\n";
|
0
|
937 ++$read_2_printed;
|
|
938 }
|
|
939
|
|
940 }
|
|
941 }
|
|
942 else{
|
|
943 print R1 $id_1;
|
|
944 print R1 "$seq_1\n";
|
|
945 print R1 $l3_1;
|
4
|
946 print R1 "$qual_1\n";
|
0
|
947
|
|
948 print R2 $id_2;
|
|
949 print R2 "$seq_2\n";
|
|
950 print R2 $l3_2;
|
4
|
951 print R2 "$qual_2\n";
|
0
|
952 }
|
|
953
|
|
954 }
|
4
|
955
|
|
956
|
|
957 my $percentage;
|
|
958
|
|
959 if ($count){
|
|
960 $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
|
|
961 }
|
|
962 else{
|
|
963 $percentage = 'N/A';
|
|
964 }
|
|
965
|
0
|
966 warn "Total number of sequences analysed: $count\n\n";
|
4
|
967 warn "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
|
0
|
968
|
|
969 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
|
|
970 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
|
|
971
|
|
972 if ($keep){
|
|
973 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
|
|
974 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
|
|
975 }
|
|
976
|
1
|
977 close R1 or die $!;
|
|
978 close R2 or die $!;
|
|
979
|
|
980 if ($retain){
|
|
981 close UNPAIRED1 or die $!;
|
|
982 close UNPAIRED2 or die $!;
|
|
983 }
|
|
984
|
0
|
985 warn "\n";
|
|
986 if ($retain){
|
|
987 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
|
|
988 }
|
|
989 else{
|
|
990 return ($out_1,$out_2);
|
|
991 }
|
|
992 }
|
|
993
|
|
994 sub process_commandline{
|
|
995 my $help;
|
|
996 my $quality;
|
|
997 my $adapter;
|
|
998 my $adapter2;
|
|
999 my $stringency;
|
|
1000 my $report;
|
|
1001 my $version;
|
|
1002 my $rrbs;
|
|
1003 my $length_cutoff;
|
|
1004 my $keep;
|
|
1005 my $fastqc;
|
|
1006 my $non_directional;
|
|
1007 my $phred33;
|
|
1008 my $phred64;
|
|
1009 my $fastqc_args;
|
|
1010 my $trim;
|
|
1011 my $gzip;
|
|
1012 my $validate;
|
|
1013 my $retain;
|
|
1014 my $length_read_1;
|
|
1015 my $length_read_2;
|
|
1016 my $error_rate;
|
|
1017 my $output_dir;
|
|
1018 my $no_report_file;
|
|
1019 my $suppress_warn;
|
1
|
1020 my $dont_gzip;
|
|
1021 my $clip_r1;
|
|
1022 my $clip_r2;
|
4
|
1023 my $three_prime_clip_r1;
|
|
1024 my $three_prime_clip_r2;
|
0
|
1025
|
|
1026 my $command_line = GetOptions ('help|man' => \$help,
|
|
1027 'q|quality=i' => \$quality,
|
|
1028 'a|adapter=s' => \$adapter,
|
|
1029 'a2|adapter2=s' => \$adapter2,
|
|
1030 'report' => \$report,
|
|
1031 'version' => \$version,
|
|
1032 'stringency=i' => \$stringency,
|
|
1033 'fastqc' => \$fastqc,
|
|
1034 'RRBS' => \$rrbs,
|
|
1035 'keep' => \$keep,
|
|
1036 'length=i' => \$length_cutoff,
|
|
1037 'non_directional' => \$non_directional,
|
|
1038 'phred33' => \$phred33,
|
|
1039 'phred64' => \$phred64,
|
|
1040 'fastqc_args=s' => \$fastqc_args,
|
|
1041 'trim1' => \$trim,
|
|
1042 'gzip' => \$gzip,
|
|
1043 'paired_end' => \$validate,
|
|
1044 'retain_unpaired' => \$retain,
|
|
1045 'length_1|r1=i' => \$length_read_1,
|
|
1046 'length_2|r2=i' => \$length_read_2,
|
|
1047 'e|error_rate=s' => \$error_rate,
|
|
1048 'o|output_dir=s' => \$output_dir,
|
|
1049 'no_report_file' => \$no_report_file,
|
|
1050 'suppress_warn' => \$suppress_warn,
|
1
|
1051 'dont_gzip' => \$dont_gzip,
|
|
1052 'clip_R1=i' => \$clip_r1,
|
|
1053 'clip_R2=i' => \$clip_r2,
|
4
|
1054 'three_prime_clip_R1=i' => \$three_prime_clip_r1,
|
|
1055 'three_prime_clip_R2=i' => \$three_prime_clip_r2,
|
0
|
1056 );
|
1
|
1057
|
0
|
1058 ### EXIT ON ERROR if there were errors with any of the supplied options
|
|
1059 unless ($command_line){
|
|
1060 die "Please respecify command line options\n";
|
|
1061 }
|
|
1062
|
|
1063 ### HELPFILE
|
|
1064 if ($help){
|
|
1065 print_helpfile();
|
|
1066 exit;
|
|
1067 }
|
|
1068
|
|
1069
|
|
1070
|
|
1071
|
|
1072 if ($version){
|
|
1073 print << "VERSION";
|
|
1074
|
|
1075 Quality-/Adapter-/RRBS-Trimming
|
|
1076 (powered by Cutadapt)
|
|
1077 version $trimmer_version
|
|
1078
|
4
|
1079 Last update: 15 04 2014
|
0
|
1080
|
|
1081 VERSION
|
|
1082 exit;
|
|
1083 }
|
|
1084
|
|
1085 ### RRBS
|
|
1086 unless ($rrbs){
|
|
1087 $rrbs = 0;
|
|
1088 }
|
|
1089
|
|
1090 ### SUPRESS WARNINGS
|
|
1091 if (defined $suppress_warn){
|
|
1092 $DOWARN = 0;
|
|
1093 }
|
|
1094
|
|
1095 ### QUALITY SCORES
|
|
1096 my $phred_encoding;
|
|
1097 if ($phred33){
|
|
1098 if ($phred64){
|
|
1099 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
|
|
1100 }
|
|
1101 $phred_encoding = 33;
|
|
1102 }
|
|
1103 elsif ($phred64){
|
|
1104 $phred_encoding = 64;
|
|
1105 }
|
|
1106 unless ($phred33 or $phred64){
|
|
1107 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
|
|
1108 $phred_encoding = 33;
|
|
1109 sleep (3);
|
|
1110 }
|
|
1111
|
|
1112 ### NON-DIRECTIONAL RRBS
|
|
1113 if ($non_directional){
|
|
1114 unless ($rrbs){
|
|
1115 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
|
|
1116 }
|
|
1117 }
|
|
1118 else{
|
|
1119 $non_directional = 0;
|
|
1120 }
|
|
1121
|
|
1122 if ($fastqc_args){
|
|
1123 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
|
|
1124 }
|
|
1125 else{
|
|
1126 $fastqc_args = 0;
|
|
1127 }
|
|
1128
|
|
1129 ### CUSTOM ERROR RATE
|
|
1130 if (defined $error_rate){
|
|
1131 # make sure that the error rate is between 0 and 1
|
|
1132 unless ($error_rate >= 0 and $error_rate <= 1){
|
|
1133 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
|
|
1134 }
|
|
1135 }
|
|
1136 else{
|
|
1137 $error_rate = 0.1; # (default)
|
|
1138 }
|
|
1139
|
|
1140 if (defined $adapter){
|
4
|
1141 unless ($adapter =~ /^[ACTGNXactgnx]+$/){
|
0
|
1142 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
1143 }
|
|
1144 $adapter = uc$adapter;
|
|
1145 }
|
|
1146
|
|
1147 if (defined $adapter2){
|
|
1148 unless ($validate){
|
|
1149 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
|
|
1150 }
|
|
1151 unless ($adapter2 =~ /^[ACTGNactgn]+$/){
|
|
1152 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
1153 }
|
|
1154 $adapter2 = uc$adapter2;
|
|
1155 }
|
|
1156
|
4
|
1157 ### LENGTH CUTOFF
|
|
1158 unless (defined $length_cutoff){
|
|
1159 $length_cutoff = 20;
|
|
1160 }
|
|
1161
|
0
|
1162 ### files are supposed to be paired-end files
|
|
1163 if ($validate){
|
|
1164
|
|
1165 # making sure that an even number of reads has been supplied
|
|
1166 unless ((scalar@ARGV)%2 == 0){
|
|
1167 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
|
|
1168 }
|
|
1169
|
|
1170 ## CUTOFF FOR VALIDATED READ-PAIRS
|
4
|
1171 if (defined $length_read_1 or defined $length_read_2){
|
0
|
1172
|
|
1173 unless ($retain){
|
|
1174 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
|
|
1175 }
|
|
1176
|
|
1177 if (defined $length_read_1){
|
|
1178 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
|
|
1179 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
1180 }
|
|
1181 unless ($length_read_1 > $length_cutoff){
|
4
|
1182 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
|
0
|
1183 }
|
|
1184 }
|
|
1185
|
|
1186 if (defined $length_read_2){
|
|
1187 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
|
|
1188 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
1189 }
|
|
1190 unless ($length_read_2 > $length_cutoff){
|
4
|
1191 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value ($length_cutoff bp)\n\n";
|
0
|
1192 }
|
|
1193 }
|
|
1194 }
|
|
1195
|
|
1196 if ($retain){
|
|
1197 $length_read_1 = 35 unless (defined $length_read_1);
|
|
1198 $length_read_2 = 35 unless (defined $length_read_2);
|
|
1199 }
|
|
1200 }
|
|
1201
|
|
1202 unless ($no_report_file){
|
|
1203 $no_report_file = 0;
|
|
1204 }
|
|
1205
|
|
1206 ### OUTPUT DIR PATH
|
|
1207 if ($output_dir){
|
|
1208 unless ($output_dir =~ /\/$/){
|
|
1209 $output_dir =~ s/$/\//;
|
|
1210 }
|
|
1211 }
|
|
1212 else{
|
|
1213 $output_dir = '';
|
|
1214 }
|
|
1215
|
1
|
1216 ### Trimming at the 5' end
|
|
1217 if (defined $clip_r2){ # trimming 5' bases of read 1
|
|
1218 die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate);
|
|
1219 }
|
|
1220
|
|
1221 if (defined $clip_r1){ # trimming 5' bases of read 1
|
|
1222 unless ($clip_r1 > 0 and $clip_r1 < 100){
|
|
1223 die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
|
|
1224 }
|
|
1225 }
|
|
1226
|
|
1227 if (defined $clip_r2){ # trimming 5' bases of read 2
|
|
1228 unless ($clip_r2 > 0 and $clip_r2 < 100){
|
|
1229 die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
|
|
1230 }
|
|
1231 }
|
|
1232
|
4
|
1233 ### Trimming at the 3' end
|
|
1234 if (defined $three_prime_clip_r1){ # trimming 3' bases of read 1
|
|
1235 unless ($three_prime_clip_r1 > 0 and $three_prime_clip_r1 < 100){
|
|
1236 die "The 3' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
|
|
1237 }
|
|
1238 }
|
1
|
1239
|
4
|
1240 if (defined $three_prime_clip_r2){ # trimming 3' bases of read 2
|
|
1241 unless ($three_prime_clip_r2 > 0 and $three_prime_clip_r2 < 100){
|
|
1242 die "The 3' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
|
|
1243 }
|
|
1244 }
|
|
1245
|
|
1246
|
|
1247 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2,$three_prime_clip_r1,$three_prime_clip_r2);
|
0
|
1248 }
|
|
1249
|
|
1250
|
|
1251
|
|
1252
|
|
1253 sub print_helpfile{
|
|
1254 print << "HELP";
|
|
1255
|
|
1256 USAGE:
|
|
1257
|
|
1258 trim_galore [options] <filename(s)>
|
|
1259
|
|
1260
|
|
1261 -h/--help Print this help message and exits.
|
|
1262
|
|
1263 -v/--version Print the version information and exits.
|
|
1264
|
|
1265 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For
|
|
1266 RRBS samples, quality trimming will be performed first, and adapter
|
|
1267 trimming is carried in a second round. Other files are quality and adapter
|
|
1268 trimmed in a single pass. The algorithm is the same as the one used by BWA
|
|
1269 (Subtract INT from all qualities; compute partial sums from all indices
|
|
1270 to the end of the sequence; cut sequence at the index at which the sum is
|
|
1271 minimal). Default Phred score: 20.
|
|
1272
|
|
1273 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
|
|
1274 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
|
|
1275
|
|
1276 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
|
|
1277 (Illumina 1.5 encoding) for quality trimming.
|
|
1278
|
|
1279 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete.
|
|
1280
|
|
1281 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed
|
|
1282 to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
|
|
1283 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
|
|
1284 automatically invoke FastQC, so --fastqc does not have to be specified
|
|
1285 separately.
|
|
1286
|
|
1287 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13
|
|
1288 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
|
|
1289
|
|
1290 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
|
|
1291 option requires '--paired' to be specified as well.
|
|
1292
|
|
1293
|
4
|
1294 --stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
|
|
1295 very stringent setting of 1, i.e. even a single bp of overlapping sequence
|
|
1296 will be trimmed off from the 3' end of any read.
|
0
|
1297
|
|
1298 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
|
|
1299 region) (default: 0.1)
|
|
1300
|
1
|
1301 --gzip Compress the output file with GZIP. If the input files are GZIP-compressed
|
|
1302 the output files will automatically be GZIP compressed as well. As of v0.2.8 the
|
|
1303 compression will take place on the fly.
|
|
1304
|
|
1305 --dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip.
|
0
|
1306
|
|
1307 --length <INT> Discard reads that became shorter than length INT because of either
|
|
1308 quality or adapter trimming. A value of '0' effectively disables
|
|
1309 this behaviour. Default: 20 bp.
|
|
1310
|
|
1311 For paired-end files, both reads of a read-pair need to be longer than
|
|
1312 <INT> bp to be printed out to validated paired-end files (see option --paired).
|
|
1313 If only one read became too short there is the possibility of keeping such
|
|
1314 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
|
|
1315
|
|
1316 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current
|
|
1317 directory.
|
|
1318
|
|
1319 --no_report_file If specified no report file will be generated.
|
|
1320
|
|
1321 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed.
|
|
1322
|
1
|
1323 --clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end
|
|
1324 reads). This may be useful if the qualities were very poor, or if there is some
|
|
1325 sort of unwanted bias at the 5' end. Default: OFF.
|
|
1326
|
|
1327 --clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads
|
|
1328 only). This may be useful if the qualities were very poor, or if there is some sort
|
|
1329 of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
|
|
1330 the first few bp because the end-repair reaction may introduce a bias towards low
|
|
1331 methylation. Please refer to the M-bias plot section in the Bismark User Guide for
|
|
1332 some examples. Default: OFF.
|
|
1333
|
4
|
1334 --three_prime_clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 1 (or single-end
|
|
1335 reads) AFTER adapter/quality trimming has been performed. This may remove some unwanted
|
|
1336 bias from the 3' end that is not directly related to adapter sequence or basecall quality.
|
|
1337 Default: OFF.
|
|
1338
|
|
1339 --three_prime_clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 3' end of read 2 AFTER
|
|
1340 adapter/quality trimming has been performed. This may remove some unwanted bias from
|
|
1341 the 3' end that is not directly related to adapter sequence or basecall quality.
|
|
1342 Default: OFF.
|
0
|
1343
|
|
1344
|
|
1345 RRBS-specific options (MspI digested material):
|
|
1346
|
|
1347 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
|
|
1348 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
|
|
1349 removed from their 3' end. This is to avoid that the filled-in C close to the
|
|
1350 second MspI site in a sequence is used for methylation calls. Sequences which
|
|
1351 were merely trimmed because of poor quality will not be shortened further.
|
|
1352
|
|
1353 --non_directional Selecting this option for non-directional RRBS libraries will screen
|
|
1354 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
|
|
1355 and, if found, removes the first two basepairs. Like with the option
|
|
1356 '--rrbs' this avoids using cytosine positions that were filled-in
|
|
1357 during the end-repair step. '--non_directional' requires '--rrbs' to
|
|
1358 be specified as well.
|
|
1359
|
|
1360 --keep Keep the quality trimmed intermediate file. Default: off, which means
|
|
1361 the temporary file is being deleted after adapter trimming. Only has
|
|
1362 an effect for RRBS samples since other FastQ files are not trimmed
|
|
1363 for poor qualities separately.
|
|
1364
|
|
1365
|
|
1366 Note for RRBS using MseI:
|
|
1367
|
|
1368 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
|
|
1369 to specify --rrbs or --non_directional since virtually all reads should start with the sequence
|
|
1370 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
|
|
1371 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
|
|
1372 run Trim Galore! in the standard (i.e. non-RRBS) mode.
|
|
1373
|
|
1374
|
|
1375 Paired-end specific options:
|
|
1376
|
|
1377 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for
|
|
1378 paired-end files. To pass the validation test, both sequences of a sequence pair
|
|
1379 are required to have a certain minimum length which is governed by the option
|
|
1380 --length (see above). If only one read passes this length threshold the
|
|
1381 other read can be rescued (see option --retain_unpaired). Using this option lets
|
|
1382 you discard too short read pairs without disturbing the sequence-by-sequence order
|
|
1383 of FastQ files which is required by many aligners.
|
|
1384
|
|
1385 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
|
|
1386 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
|
|
1387
|
|
1388 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
|
|
1389 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
|
|
1390 alignments like this:
|
|
1391
|
|
1392 R1 ---------------------------> or this: -----------------------> R1
|
|
1393 R2 <--------------------------- <----------------- R2
|
|
1394
|
|
1395 as invalid (whenever a start/end coordinate is contained within the other read).
|
|
1396
|
|
1397 --retain_unpaired If only one of the two paired-end reads became too short, the longer
|
|
1398 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
|
|
1399 output files. The length cutoff for unpaired single-end reads is
|
|
1400 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
|
|
1401
|
|
1402 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to
|
|
1403 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
|
|
1404 Default: 35 bp.
|
|
1405
|
|
1406 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
|
|
1407 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
|
|
1408 Default: 35 bp.
|
|
1409
|
|
1410
|
4
|
1411 Last modified on 16 July 2014.
|
0
|
1412
|
|
1413 HELP
|
|
1414 exit;
|
|
1415 }
|