0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5 use IPC::Open3;
|
|
6 use File::Spec;
|
|
7 use File::Basename;
|
|
8 use Cwd;
|
|
9
|
1
|
10 ## This program is Copyright (C) 2012-13, Felix Krueger (felix.krueger@babraham.ac.uk)
|
0
|
11
|
|
12 ## This program is free software: you can redistribute it and/or modify
|
|
13 ## it under the terms of the GNU General Public License as published by
|
|
14 ## the Free Software Foundation, either version 3 of the License, or
|
|
15 ## (at your option) any later version.
|
|
16
|
|
17 ## This program is distributed in the hope that it will be useful,
|
|
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 ## GNU General Public License for more details.
|
|
21
|
|
22 ## You should have received a copy of the GNU General Public License
|
|
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
|
25
|
|
26
|
|
27 ## this script is taking in FastQ sequences and trims them with Cutadapt
|
1
|
28 ## last modified on 10 April 2013
|
0
|
29
|
|
30 ########################################################################
|
|
31
|
|
32 # change these paths if needed
|
|
33
|
|
34 my $path_to_cutadapt = 'cutadapt';
|
|
35 my $path_to_fastqc = 'fastqc';
|
|
36
|
|
37 ########################################################################
|
|
38
|
|
39
|
1
|
40 my $trimmer_version = '0.2.8';
|
0
|
41 my $DOWARN = 1; # print on screen warning and text by default
|
|
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
|
|
43
|
1
|
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2) = process_commandline();
|
|
45
|
|
46 my @filenames = @ARGV;
|
|
47
|
|
48 die "\nPlease provide the filename(s) of one or more FastQ file(s) to launch Trim Galore!\n
|
|
49 USAGE: 'trim_galore [options] <filename(s)>' or 'trim_galore --help' for more options\n\n" unless (@filenames);
|
|
50
|
0
|
51
|
|
52 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
|
|
53 unless (defined $cutoff){
|
|
54 $cutoff = 20;
|
|
55 }
|
|
56 my $phred_score_cutoff = $cutoff; # only relevant for report
|
|
57
|
|
58 unless (defined $adapter){
|
|
59 $adapter = 'AGATCGGAAGAGC';
|
|
60 }
|
|
61 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
|
|
62 $a2 = '';
|
|
63 }
|
|
64
|
|
65 unless (defined $stringency){
|
|
66 $stringency = 1;
|
|
67 }
|
|
68
|
|
69 unless (defined $length_cutoff){
|
|
70 $length_cutoff = 20;
|
|
71 }
|
|
72
|
|
73 if ($phred_encoding == 64){
|
|
74 $cutoff += 31;
|
|
75 }
|
|
76
|
|
77 my $file_1;
|
|
78 my $file_2;
|
|
79
|
|
80 foreach my $filename (@ARGV){
|
|
81 trim ($filename);
|
|
82 }
|
|
83
|
|
84
|
|
85 sub trim{
|
|
86 my $filename = shift;
|
|
87
|
|
88 my $output_filename = (split (/\//,$filename))[-1];
|
|
89 # warn "Here is the outputfile name: $output_filename\n";
|
|
90
|
|
91 my $report = $output_filename;
|
|
92 $report =~ s/$/_trimming_report.txt/;
|
|
93
|
|
94 if ($no_report_file) {
|
|
95 $report = File::Spec->devnull;
|
|
96 open (REPORT,'>',$report) or die "Failed to write to file: $!\n";
|
|
97 # warn "Redirecting report output to /dev/null\n";
|
|
98 }
|
|
99 else{
|
|
100 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n";
|
|
101 warn "Writing report to '$output_dir$report'\n";
|
|
102 }
|
|
103
|
|
104 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
105 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
106
|
|
107 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
108 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
109
|
|
110 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
111 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
112
|
|
113 warn "Adapter sequence: '$adapter'\n";
|
|
114 print REPORT "Adapter sequence: '$adapter'\n";
|
|
115
|
|
116 if ($error_rate == 0.1){
|
|
117 warn "Maximum trimming error rate: $error_rate (default)\n";
|
|
118 }
|
|
119 else{
|
|
120 warn "Maximum trimming error rate: $error_rate\n";
|
|
121 }
|
|
122
|
|
123 print REPORT "Maximum trimming error rate: $error_rate";
|
|
124 if ($error_rate == 0.1){
|
|
125 print REPORT " (default)\n";
|
|
126 }
|
|
127 else{
|
|
128 print REPORT "\n";
|
|
129 }
|
|
130
|
|
131 if ($a2){
|
|
132 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
133 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
134 }
|
|
135
|
|
136 warn "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
137 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
138
|
|
139 if ($validate){
|
|
140 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
141 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
142 }
|
|
143 else{
|
|
144 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
145 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
146 }
|
|
147
|
|
148 if ($validate){ # only for paired-end files
|
|
149
|
|
150 if ($retain){ # keeping single-end reads if only one end is long enough
|
|
151
|
|
152 if ($length_read_1 == 35){
|
|
153 warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
154 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
155 }
|
|
156 else{
|
|
157 warn "Length cut-off for read 1: $length_read_1 bp\n";
|
|
158 print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
|
|
159 }
|
|
160
|
|
161 if ($length_read_2 == 35){
|
1
|
162 warn "Length cut-off for read 2: $length_read_2 bb (default)\n";
|
0
|
163 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
|
|
164 }
|
|
165 else{
|
|
166 warn "Length cut-off for read 2: $length_read_2 bp\n";
|
|
167 print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
|
|
168 }
|
|
169 }
|
|
170 }
|
|
171
|
|
172 if ($rrbs){
|
|
173 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
174 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
175 }
|
|
176
|
|
177 if ($non_directional){
|
|
178 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
179 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
180 }
|
|
181
|
|
182 if ($trim){
|
|
183 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
184 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
185 }
|
|
186
|
1
|
187 if ($clip_r1){
|
|
188 warn "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
|
|
189 print REPORT "All Read 1 sequences will be trimmed by $clip_r1 bp from their 5' end to avoid poor qualities or biases\n";
|
|
190 }
|
|
191 if ($clip_r2){
|
|
192 warn "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
|
|
193 print REPORT "All Read 2 sequences will be trimmed by $clip_r2 bp from their 5' end to avoid poor qualities or biases (e.g. M-bias for BS-Seq applications)\n";
|
|
194 }
|
|
195
|
|
196
|
0
|
197 if ($fastqc){
|
|
198 warn "Running FastQC on the data once trimming has completed\n";
|
|
199 print REPORT "Running FastQC on the data once trimming has completed\n";
|
|
200
|
|
201 if ($fastqc_args){
|
|
202 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
|
|
203 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n";
|
|
204 }
|
|
205 }
|
|
206
|
|
207 if ($keep and $rrbs){
|
|
208 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
209 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
210 }
|
|
211
|
1
|
212
|
0
|
213 if ($gzip or $filename =~ /\.gz$/){
|
1
|
214 $gzip = 1;
|
|
215 unless ($dont_gzip){
|
|
216 warn "Output file(s) will be GZIP compressed\n";
|
|
217 print REPORT "Output file will be GZIP compressed\n";
|
|
218 }
|
0
|
219 }
|
|
220
|
|
221 warn "\n";
|
|
222 print REPORT "\n";
|
|
223 sleep (3);
|
|
224
|
|
225 my $temp;
|
|
226
|
|
227 ### Proceeding differently for RRBS and other type of libraries
|
|
228 if ($rrbs){
|
|
229
|
|
230 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
|
|
231 if ($cutoff == 0){
|
|
232 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n";
|
|
233 sleep (3);
|
|
234 }
|
|
235 else{
|
|
236
|
|
237 $temp = $filename;
|
|
238 $temp =~ s/$/_qual_trimmed.fastq/;
|
|
239 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!";
|
|
240
|
|
241 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
|
|
242 sleep (3);
|
|
243
|
|
244 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
|
|
245
|
|
246 my $qual_count = 0;
|
|
247
|
|
248 while (1){
|
|
249 my $l1 = <QUAL>;
|
|
250 my $seq = <QUAL>;
|
|
251 my $l3 = <QUAL>;
|
|
252 my $qual = <QUAL>;
|
|
253 last unless (defined $qual);
|
|
254
|
|
255 $qual_count++;
|
|
256 if ($qual_count%10000000 == 0){
|
|
257 warn "$qual_count sequences processed\n";
|
|
258 }
|
|
259 print TEMP "$l1$seq$l3$qual";
|
|
260 }
|
|
261
|
|
262 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
|
|
263 close QUAL or die "Unable to close filehandle: $!\n";
|
|
264 sleep (3);
|
|
265
|
|
266 }
|
|
267 }
|
|
268
|
|
269
|
|
270 if ($output_filename =~ /\.fastq$/){
|
|
271 $output_filename =~ s/\.fastq$/_trimmed.fq/;
|
|
272 }
|
|
273 elsif ($output_filename =~ /\.fastq\.gz$/){
|
|
274 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
|
|
275 }
|
|
276 elsif ($output_filename =~ /\.fq$/){
|
|
277 $output_filename =~ s/\.fq$/_trimmed.fq/;
|
|
278 }
|
|
279 elsif ($output_filename =~ /\.fq\.gz$/){
|
|
280 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
|
|
281 }
|
|
282 else{
|
|
283 $output_filename =~ s/$/_trimmed.fq/;
|
|
284 }
|
|
285
|
1
|
286 if ($gzip or $filename =~ /\.gz$/){
|
|
287 unless ($dont_gzip){
|
|
288 if ($validate){
|
|
289 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
|
|
290 }
|
|
291 else{
|
|
292 $output_filename .= '.gz';
|
|
293 open (OUT,"| gzip -c - > ${output_dir}${output_filename}") or die "Can't write to $output_filename: $!\n";
|
|
294 }
|
|
295 }
|
|
296 else{
|
|
297 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; # don't need to gzip intermediate file
|
|
298 }
|
|
299 }
|
|
300 else{
|
|
301 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
|
|
302 }
|
0
|
303 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
|
|
304
|
|
305 my $count = 0;
|
|
306 my $too_short = 0;
|
|
307 my $quality_trimmed = 0;
|
|
308 my $rrbs_trimmed = 0;
|
|
309 my $rrbs_trimmed_start = 0;
|
|
310 my $CAA = 0;
|
|
311 my $CGA = 0;
|
|
312
|
|
313 if ($rrbs and $cutoff != 0){
|
|
314
|
|
315 ### optionally using 2 different adapters for read 1 and read 2
|
|
316 if ($validate and $a2){
|
|
317 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
318 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
319 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
320 sleep (3);
|
|
321 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
322 }
|
|
323 else{ # this is read 2 of a pair
|
|
324 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
|
|
325 sleep (3);
|
|
326 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
327 }
|
|
328 }
|
|
329 ### Using the same adapter for both read 1 and read 2
|
|
330 else{
|
|
331 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
332 sleep (3);
|
|
333 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
334 }
|
|
335
|
|
336 close WRITER or die $!; # not needed
|
|
337
|
|
338 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
|
|
339
|
|
340 if ($filename =~ /\.gz$/){
|
|
341 open (IN,"zcat $filename |") or die $!; # original, untrimmed file
|
|
342 }
|
|
343 else{
|
|
344 open (IN,$filename) or die $!; # original, untrimmed file
|
|
345 }
|
|
346
|
|
347 while (1){
|
|
348
|
|
349 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
|
|
350 my $l1 = <TRIM>;
|
|
351 my $seq = <TRIM>; # adapter trimmed sequence
|
|
352 my $l3 = <TRIM>;
|
|
353 my $qual = <TRIM>;
|
|
354
|
|
355 $_ = <IN>; # irrelevant
|
|
356 my $original_seq = <IN>;
|
|
357 $_ = <IN>; # irrelevant
|
|
358 $_ = <IN>; # irrelevant
|
|
359
|
|
360 $_ = <QUAL>; # irrelevant
|
|
361 my $qual_trimmed_seq = <QUAL>;
|
|
362 $_ = <QUAL>; # irrelevant
|
|
363 my $qual_trimmed_qual = <QUAL>;
|
|
364
|
|
365 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
|
|
366
|
|
367 $count++;
|
|
368 if ($count%10000000 == 0){
|
|
369 warn "$count sequences processed\n";
|
|
370 }
|
|
371
|
|
372 chomp $seq;
|
|
373 chomp $qual;
|
|
374 chomp $qual_trimmed_seq;
|
|
375 chomp $original_seq;
|
|
376
|
|
377 my $quality_trimmed_seq_length = length $qual_trimmed_seq;
|
|
378
|
|
379 if (length $original_seq > length $qual_trimmed_seq){
|
|
380 ++$quality_trimmed;
|
|
381 }
|
|
382
|
|
383 my $nd = 0;
|
|
384
|
|
385 ### NON-DIRECTIONAL RRBS
|
|
386 if ($non_directional){
|
|
387 if (length$seq > 2){
|
|
388 if ($seq =~ /^CAA/){
|
|
389 ++$CAA;
|
|
390 $seq = substr ($seq,2,length($seq)-2);
|
|
391 $qual = substr ($qual,2,length($qual)-2);
|
|
392 ++$rrbs_trimmed_start;
|
|
393 $nd = 1;
|
|
394 }
|
|
395 elsif ($seq =~ /^CGA/){
|
|
396 $seq = substr ($seq,2,length($seq)-2);
|
|
397 $qual = substr ($qual,2,length($qual)-2);
|
|
398 ++$CGA;
|
|
399 ++$rrbs_trimmed_start;
|
|
400 $nd = 1;
|
|
401 }
|
|
402 }
|
|
403 }
|
|
404
|
|
405 ### directional read
|
|
406 unless ($nd == 1){
|
|
407 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
|
|
408 $seq = substr ($seq,0,length($seq)-2);
|
|
409 $qual = substr ($qual,0,length($qual)-2);
|
|
410 ++$rrbs_trimmed;
|
|
411 }
|
|
412 }
|
|
413
|
|
414 ### Shortening all sequences by 1 bp on the 3' end
|
|
415 if ($trim){
|
|
416 $seq = substr($seq,0,length($seq)-1);
|
|
417 $qual = substr($qual,0,length($qual)-1);
|
|
418 }
|
|
419
|
|
420 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
421 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
422 print OUT "$l1$seq\n$l3$qual\n";
|
|
423 }
|
|
424 else{ # single end
|
1
|
425
|
|
426 if ($clip_r1){
|
|
427 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
428 $qual = substr($qual,$clip_r1);
|
|
429 }
|
|
430
|
0
|
431 if (length $seq < $length_cutoff){
|
|
432 ++$too_short;
|
|
433 next;
|
|
434 }
|
|
435 else{
|
|
436 print OUT "$l1$seq\n$l3$qual\n";
|
|
437 }
|
|
438 }
|
|
439 }
|
|
440
|
|
441 print REPORT "\n";
|
|
442 while (<ERROR>){
|
|
443 warn $_;
|
|
444 print REPORT $_;
|
|
445 }
|
|
446
|
|
447 close IN or die "Unable to close IN filehandle: $!";
|
|
448 close QUAL or die "Unable to close QUAL filehandle: $!";
|
|
449 close TRIM or die "Unable to close TRIM filehandle: $!";
|
|
450 close OUT or die "Unable to close OUT filehandle: $!";
|
|
451 }
|
|
452 else{
|
|
453
|
|
454 ### optionally using 2 different adapters for read 1 and read 2
|
|
455 if ($validate and $a2){
|
|
456 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
457 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
458 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
459 sleep (3);
|
|
460 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
|
461 }
|
|
462 else{ # this is read 2 of a pair
|
|
463 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
|
|
464 sleep (3);
|
|
465 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
|
|
466 }
|
|
467 }
|
|
468 ### Using the same adapter for both read 1 and read 2
|
|
469 else{
|
|
470 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
471 sleep (3);
|
|
472 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
|
473 }
|
|
474
|
|
475 close WRITER or die $!; # not needed
|
|
476
|
|
477 while (1){
|
|
478
|
|
479 my $l1 = <TRIM>;
|
|
480 my $seq = <TRIM>; # quality and/or adapter trimmed sequence
|
|
481 my $l3 = <TRIM>;
|
|
482 my $qual = <TRIM>;
|
|
483 # print "$l1$seq\n$l3$qual\n";
|
|
484 last unless (defined $qual); # could be an empty string
|
|
485
|
|
486 $count++;
|
|
487 if ($count%10000000 == 0){
|
|
488 warn "$count sequences processed\n";
|
|
489 }
|
|
490
|
|
491 chomp $seq;
|
|
492 chomp $qual;
|
|
493
|
|
494 ### Shortening all sequences by 1 bp on the 3' end
|
|
495 if ($trim){
|
|
496 $seq = substr($seq,0,length($seq)-1);
|
|
497 $qual = substr($qual,0,length($qual)-1);
|
|
498 }
|
|
499
|
|
500 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
501 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
502 print OUT "$l1$seq\n$l3$qual\n";
|
|
503 }
|
|
504 else{ # single end
|
1
|
505
|
|
506 if ($clip_r1){
|
|
507 $seq = substr($seq,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
508 $qual = substr($qual,$clip_r1);
|
|
509 }
|
|
510
|
0
|
511 if (length $seq < $length_cutoff){
|
|
512 ++$too_short;
|
|
513 next;
|
|
514 }
|
|
515 else{
|
|
516 print OUT "$l1$seq\n$l3$qual\n";
|
|
517 }
|
|
518 }
|
|
519 }
|
|
520
|
|
521 print REPORT "\n";
|
|
522 while (<ERROR>){
|
|
523 warn $_;
|
|
524 print REPORT $_;
|
|
525 }
|
|
526
|
|
527 close TRIM or die "Unable to close TRIM filehandle: $!\n";
|
|
528 close ERROR or die "Unable to close ERROR filehandle: $!\n";
|
|
529 close OUT or die "Unable to close OUT filehandle: $!\n";
|
|
530
|
|
531 }
|
|
532
|
|
533 if ($rrbs){
|
|
534 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
|
|
535
|
|
536 # deleting temporary quality trimmed file
|
|
537 my $deleted = unlink "$output_dir$temp";
|
|
538
|
|
539 if ($deleted){
|
|
540 warn "Successfully deleted temporary file $temp\n\n";
|
|
541 }
|
|
542 else{
|
|
543 warn "Could not delete temporary file $temp";
|
|
544 }
|
|
545 }
|
|
546 }
|
|
547
|
|
548 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
549 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
550
|
|
551 warn "="x 45,"\n";
|
|
552 print REPORT "="x 45,"\n";
|
|
553
|
|
554 warn "$count sequences processed in total\n";
|
|
555 print REPORT "$count sequences processed in total\n";
|
|
556
|
|
557 ### only reporting this separately if quality and adapter trimming were performed separately
|
|
558 if ($rrbs){
|
|
559 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
|
|
560 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
561 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
562 }
|
|
563
|
|
564 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
|
|
565 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
566 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
567
|
|
568 if ($rrbs){
|
|
569 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
|
|
570 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
571 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
572 }
|
|
573
|
|
574 if ($non_directional){
|
|
575 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
|
|
576 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
577 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
578 }
|
|
579
|
|
580 warn "\n";
|
|
581 print REPORT "\n";
|
|
582
|
1
|
583 ### RUNNING FASTQC unless we are dealing with paired-end files
|
|
584 unless($validate){
|
|
585 if ($fastqc){
|
|
586 warn "\n >>> Now running FastQC on the data <<<\n\n";
|
|
587 sleep (5);
|
|
588 if ($fastqc_args){
|
|
589 system ("$path_to_fastqc $fastqc_args $output_dir$output_filename");
|
|
590 }
|
|
591 else{
|
|
592 system ("$path_to_fastqc $output_dir$output_filename");
|
|
593 }
|
0
|
594 }
|
|
595 }
|
|
596
|
|
597 ### VALIDATE PAIRED-END FILES
|
|
598 if ($validate){
|
|
599
|
|
600 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
601
|
|
602 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
603 $file_1 = $output_filename;
|
|
604 shift @filenames;
|
|
605 # warn "This is read 1: $file_1\n\n";
|
|
606 }
|
|
607 else{ # this is read 2 of a pair
|
|
608 $file_2 = $output_filename;
|
|
609 shift @filenames;
|
|
610 # warn "This is read 2: $file_2\n\n";
|
|
611 }
|
|
612
|
|
613 if ($file_1 and $file_2){
|
|
614 warn "Validate paired-end files $file_1 and $file_2\n";
|
|
615 sleep (1);
|
|
616
|
|
617 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
|
|
618
|
|
619 ### RUNNING FASTQC
|
|
620 if ($fastqc){
|
|
621
|
|
622 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n";
|
|
623 sleep (3);
|
|
624
|
|
625 if ($fastqc_args){
|
1
|
626 system ("$path_to_fastqc $fastqc_args $output_dir$val_1");
|
0
|
627 }
|
|
628 else{
|
1
|
629 system ("$path_to_fastqc $output_dir$val_1");
|
0
|
630 }
|
|
631
|
|
632 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n";
|
|
633 sleep (3);
|
|
634
|
|
635 if ($fastqc_args){
|
1
|
636 system ("$path_to_fastqc $fastqc_args $output_dir$val_2");
|
0
|
637 }
|
|
638 else{
|
1
|
639 system ("$path_to_fastqc $output_dir$val_2");
|
0
|
640 }
|
|
641
|
|
642 }
|
|
643
|
|
644 warn "Deleting both intermediate output files $file_1 and $file_2\n";
|
|
645 unlink "$output_dir$file_1";
|
|
646 unlink "$output_dir$file_2";
|
|
647
|
|
648 warn "\n",'='x100,"\n\n";
|
|
649 sleep (3);
|
|
650
|
|
651 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
|
|
652 $file_2 = undef;
|
|
653 }
|
|
654 }
|
|
655
|
|
656 }
|
|
657
|
|
658 sub validate_paired_end_files{
|
|
659
|
|
660 my $file_1 = shift;
|
|
661 my $file_2 = shift;
|
|
662
|
1
|
663 warn "file_1: $file_1, file_2: $file_2\n\n";
|
0
|
664
|
|
665 if ($file_1 =~ /\.gz$/){
|
|
666 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
|
|
667 }
|
|
668 else{
|
|
669 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
|
|
670 }
|
|
671
|
|
672 if ($file_2 =~ /\.gz$/){
|
|
673 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
|
|
674 }
|
|
675 else{
|
|
676 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
|
|
677 }
|
|
678
|
|
679 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
|
|
680 sleep (3);
|
|
681
|
|
682 my $out_1 = $file_1;
|
|
683 my $out_2 = $file_2;
|
|
684
|
|
685 if ($out_1 =~ /gz$/){
|
|
686 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
|
|
687 }
|
|
688 else{
|
|
689 $out_1 =~ s/trimmed\.fq$/val_1.fq/;
|
|
690 }
|
|
691
|
|
692 if ($out_2 =~ /gz$/){
|
|
693 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
|
|
694 }
|
|
695 else{
|
|
696 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
|
|
697 }
|
|
698
|
1
|
699 if ($gzip){
|
|
700 if ($dont_gzip){
|
|
701 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
|
|
702 }
|
|
703 else{
|
|
704 $out_1 .= '.gz';
|
|
705 open (R1,"| gzip -c - > ${output_dir}${out_1}") or die "Can't write to $out_1: $!\n";
|
|
706 }
|
|
707 }
|
|
708 else{
|
|
709 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
|
|
710 }
|
|
711
|
|
712 if ($gzip){
|
|
713 if ($dont_gzip){
|
|
714 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
|
|
715 }
|
|
716 else{
|
|
717 $out_2 .= '.gz';
|
|
718 open (R2,"| gzip -c - > ${output_dir}${out_2}") or die "Can't write to $out_2: $!\n";
|
|
719 }
|
|
720 }
|
|
721 else{
|
|
722 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
|
|
723 }
|
|
724
|
0
|
725 warn "Writing validated paired-end read 1 reads to $out_1\n";
|
|
726 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
|
|
727
|
|
728 my $unpaired_1;
|
|
729 my $unpaired_2;
|
|
730
|
|
731 if ($retain){
|
|
732
|
|
733 $unpaired_1 = $file_1;
|
|
734 $unpaired_2 = $file_2;
|
|
735
|
|
736 if ($unpaired_1 =~ /gz$/){
|
|
737 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
|
|
738 }
|
|
739 else{
|
|
740 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
|
|
741 }
|
|
742
|
|
743 if ($unpaired_2 =~ /gz$/){
|
|
744 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
|
|
745 }
|
|
746 else{
|
|
747 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
|
|
748 }
|
|
749
|
1
|
750 if ($gzip){
|
|
751 if ($dont_gzip){
|
|
752 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
|
|
753 }
|
|
754 else{
|
|
755 $unpaired_1 .= '.gz';
|
|
756 open (UNPAIRED1,"| gzip -c - > ${output_dir}${unpaired_1}") or die "Can't write to $unpaired_1: $!\n";
|
|
757 }
|
|
758 }
|
|
759 else{
|
|
760 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
|
|
761 }
|
|
762
|
|
763 if ($gzip){
|
|
764 if ($dont_gzip){
|
|
765 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
|
|
766 }
|
|
767 else{
|
|
768 $unpaired_2 .= '.gz';
|
|
769 open (UNPAIRED2,"| gzip -c - > ${output_dir}${unpaired_2}") or die "Can't write to $unpaired_2: $!\n";
|
|
770 }
|
|
771 }
|
|
772 else{
|
|
773 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
|
|
774 }
|
0
|
775
|
|
776 warn "Writing unpaired read 1 reads to $unpaired_1\n";
|
|
777 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
|
|
778 }
|
|
779
|
|
780 my $sequence_pairs_removed = 0;
|
|
781 my $read_1_printed = 0;
|
|
782 my $read_2_printed = 0;
|
|
783
|
|
784 my $count = 0;
|
|
785
|
|
786 while (1){
|
|
787 my $id_1 = <IN1>;
|
|
788 my $seq_1 = <IN1>;
|
|
789 my $l3_1 = <IN1>;
|
|
790 my $qual_1 = <IN1>;
|
|
791 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
|
|
792
|
|
793 my $id_2 = <IN2>;
|
|
794 my $seq_2 = <IN2>;
|
|
795 my $l3_2 = <IN2>;
|
|
796 my $qual_2 = <IN2>;
|
|
797 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
|
|
798
|
|
799 ++$count;
|
|
800
|
|
801
|
1
|
802 ## small check if the sequence files appear to be FastQ files
|
0
|
803 if ($count == 1){ # performed just once
|
|
804 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
|
|
805 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
806 }
|
|
807 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
|
|
808 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
809 }
|
|
810 }
|
|
811
|
|
812 chomp $seq_1;
|
|
813 chomp $seq_2;
|
|
814
|
1
|
815 if ($clip_r1){
|
|
816 $seq_1 = substr($seq_1,$clip_r1); # starting after the sequences to be trimmed until the end of the sequence
|
|
817 $qual_1 = substr($qual_1,$clip_r1);
|
|
818 }
|
|
819 if ($clip_r2){
|
|
820 $seq_2 = substr($seq_2,$clip_r2); # starting after the sequences to be trimmed until the end of the sequence
|
|
821 $qual_2 = substr($qual_2,$clip_r2);
|
|
822 }
|
0
|
823
|
|
824 ### making sure that the reads do have a sensible length
|
|
825 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
|
|
826 ++$sequence_pairs_removed;
|
|
827 if ($retain){ # writing out single-end reads if they are longer than the cutoff
|
|
828
|
|
829 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
|
|
830 print UNPAIRED1 $id_1;
|
|
831 print UNPAIRED1 "$seq_1\n";
|
|
832 print UNPAIRED1 $l3_1;
|
|
833 print UNPAIRED1 $qual_1;
|
|
834 ++$read_1_printed;
|
|
835 }
|
|
836
|
|
837 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
|
|
838 print UNPAIRED2 $id_2;
|
|
839 print UNPAIRED2 "$seq_2\n";
|
|
840 print UNPAIRED2 $l3_2;
|
|
841 print UNPAIRED2 $qual_2;
|
|
842 ++$read_2_printed;
|
|
843 }
|
|
844
|
|
845 }
|
|
846 }
|
|
847 else{
|
|
848 print R1 $id_1;
|
|
849 print R1 "$seq_1\n";
|
|
850 print R1 $l3_1;
|
|
851 print R1 $qual_1;
|
|
852
|
|
853 print R2 $id_2;
|
|
854 print R2 "$seq_2\n";
|
|
855 print R2 $l3_2;
|
|
856 print R2 $qual_2;
|
|
857 }
|
|
858
|
|
859 }
|
|
860 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
|
|
861 warn "Total number of sequences analysed: $count\n\n";
|
|
862 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n";
|
|
863
|
|
864 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
|
|
865 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
|
|
866
|
|
867 if ($keep){
|
|
868 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
|
|
869 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
|
|
870 }
|
|
871
|
1
|
872 close R1 or die $!;
|
|
873 close R2 or die $!;
|
|
874
|
|
875 if ($retain){
|
|
876 close UNPAIRED1 or die $!;
|
|
877 close UNPAIRED2 or die $!;
|
|
878 }
|
|
879
|
0
|
880 warn "\n";
|
|
881 if ($retain){
|
|
882 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
|
|
883 }
|
|
884 else{
|
|
885 return ($out_1,$out_2);
|
|
886 }
|
|
887 }
|
|
888
|
|
889 sub process_commandline{
|
|
890 my $help;
|
|
891 my $quality;
|
|
892 my $adapter;
|
|
893 my $adapter2;
|
|
894 my $stringency;
|
|
895 my $report;
|
|
896 my $version;
|
|
897 my $rrbs;
|
|
898 my $length_cutoff;
|
|
899 my $keep;
|
|
900 my $fastqc;
|
|
901 my $non_directional;
|
|
902 my $phred33;
|
|
903 my $phred64;
|
|
904 my $fastqc_args;
|
|
905 my $trim;
|
|
906 my $gzip;
|
|
907 my $validate;
|
|
908 my $retain;
|
|
909 my $length_read_1;
|
|
910 my $length_read_2;
|
|
911 my $error_rate;
|
|
912 my $output_dir;
|
|
913 my $no_report_file;
|
|
914 my $suppress_warn;
|
1
|
915 my $dont_gzip;
|
|
916 my $clip_r1;
|
|
917 my $clip_r2;
|
0
|
918
|
|
919 my $command_line = GetOptions ('help|man' => \$help,
|
|
920 'q|quality=i' => \$quality,
|
|
921 'a|adapter=s' => \$adapter,
|
|
922 'a2|adapter2=s' => \$adapter2,
|
|
923 'report' => \$report,
|
|
924 'version' => \$version,
|
|
925 'stringency=i' => \$stringency,
|
|
926 'fastqc' => \$fastqc,
|
|
927 'RRBS' => \$rrbs,
|
|
928 'keep' => \$keep,
|
|
929 'length=i' => \$length_cutoff,
|
|
930 'non_directional' => \$non_directional,
|
|
931 'phred33' => \$phred33,
|
|
932 'phred64' => \$phred64,
|
|
933 'fastqc_args=s' => \$fastqc_args,
|
|
934 'trim1' => \$trim,
|
|
935 'gzip' => \$gzip,
|
|
936 'paired_end' => \$validate,
|
|
937 'retain_unpaired' => \$retain,
|
|
938 'length_1|r1=i' => \$length_read_1,
|
|
939 'length_2|r2=i' => \$length_read_2,
|
|
940 'e|error_rate=s' => \$error_rate,
|
|
941 'o|output_dir=s' => \$output_dir,
|
|
942 'no_report_file' => \$no_report_file,
|
|
943 'suppress_warn' => \$suppress_warn,
|
1
|
944 'dont_gzip' => \$dont_gzip,
|
|
945 'clip_R1=i' => \$clip_r1,
|
|
946 'clip_R2=i' => \$clip_r2,
|
0
|
947 );
|
1
|
948
|
|
949
|
0
|
950 ### EXIT ON ERROR if there were errors with any of the supplied options
|
|
951 unless ($command_line){
|
|
952 die "Please respecify command line options\n";
|
|
953 }
|
|
954
|
|
955 ### HELPFILE
|
|
956 if ($help){
|
|
957 print_helpfile();
|
|
958 exit;
|
|
959 }
|
|
960
|
|
961
|
|
962
|
|
963
|
|
964 if ($version){
|
|
965 print << "VERSION";
|
|
966
|
|
967 Quality-/Adapter-/RRBS-Trimming
|
|
968 (powered by Cutadapt)
|
|
969 version $trimmer_version
|
|
970
|
1
|
971 Last update: 10 04 2013
|
0
|
972
|
|
973 VERSION
|
|
974 exit;
|
|
975 }
|
|
976
|
|
977 ### RRBS
|
|
978 unless ($rrbs){
|
|
979 $rrbs = 0;
|
|
980 }
|
|
981
|
|
982 ### SUPRESS WARNINGS
|
|
983 if (defined $suppress_warn){
|
|
984 $DOWARN = 0;
|
|
985 }
|
|
986
|
|
987 ### QUALITY SCORES
|
|
988 my $phred_encoding;
|
|
989 if ($phred33){
|
|
990 if ($phred64){
|
|
991 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
|
|
992 }
|
|
993 $phred_encoding = 33;
|
|
994 }
|
|
995 elsif ($phred64){
|
|
996 $phred_encoding = 64;
|
|
997 }
|
|
998 unless ($phred33 or $phred64){
|
|
999 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
|
|
1000 $phred_encoding = 33;
|
|
1001 sleep (3);
|
|
1002 }
|
|
1003
|
|
1004 ### NON-DIRECTIONAL RRBS
|
|
1005 if ($non_directional){
|
|
1006 unless ($rrbs){
|
|
1007 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
|
|
1008 }
|
|
1009 }
|
|
1010 else{
|
|
1011 $non_directional = 0;
|
|
1012 }
|
|
1013
|
|
1014 if ($fastqc_args){
|
|
1015 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
|
|
1016 }
|
|
1017 else{
|
|
1018 $fastqc_args = 0;
|
|
1019 }
|
|
1020
|
|
1021 ### CUSTOM ERROR RATE
|
|
1022 if (defined $error_rate){
|
|
1023 # make sure that the error rate is between 0 and 1
|
|
1024 unless ($error_rate >= 0 and $error_rate <= 1){
|
|
1025 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
|
|
1026 }
|
|
1027 }
|
|
1028 else{
|
|
1029 $error_rate = 0.1; # (default)
|
|
1030 }
|
|
1031
|
|
1032 if (defined $adapter){
|
|
1033 unless ($adapter =~ /^[ACTGNactgn]+$/){
|
|
1034 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
1035 }
|
|
1036 $adapter = uc$adapter;
|
|
1037 }
|
|
1038
|
|
1039 if (defined $adapter2){
|
|
1040 unless ($validate){
|
|
1041 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
|
|
1042 }
|
|
1043 unless ($adapter2 =~ /^[ACTGNactgn]+$/){
|
|
1044 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
1045 }
|
|
1046 $adapter2 = uc$adapter2;
|
|
1047 }
|
|
1048
|
|
1049 ### files are supposed to be paired-end files
|
|
1050 if ($validate){
|
|
1051
|
|
1052 # making sure that an even number of reads has been supplied
|
|
1053 unless ((scalar@ARGV)%2 == 0){
|
|
1054 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
|
|
1055 }
|
|
1056
|
|
1057 ## CUTOFF FOR VALIDATED READ-PAIRS
|
|
1058
|
|
1059 if (defined $length_read_1 or defined $length_read_2){
|
|
1060 unless ($retain){
|
|
1061 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
|
|
1062 }
|
|
1063
|
|
1064 if (defined $length_read_1){
|
|
1065 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
|
|
1066 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
1067 }
|
|
1068 unless ($length_read_1 > $length_cutoff){
|
|
1069 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
|
|
1070 }
|
|
1071 }
|
|
1072
|
|
1073 if (defined $length_read_2){
|
|
1074 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
|
|
1075 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
1076 }
|
|
1077 unless ($length_read_2 > $length_cutoff){
|
|
1078 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
|
|
1079 }
|
|
1080 }
|
|
1081 }
|
|
1082
|
|
1083 if ($retain){
|
|
1084 $length_read_1 = 35 unless (defined $length_read_1);
|
|
1085 $length_read_2 = 35 unless (defined $length_read_2);
|
|
1086 }
|
|
1087 }
|
|
1088
|
|
1089 unless ($no_report_file){
|
|
1090 $no_report_file = 0;
|
|
1091 }
|
|
1092
|
|
1093 ### OUTPUT DIR PATH
|
|
1094 if ($output_dir){
|
|
1095 unless ($output_dir =~ /\/$/){
|
|
1096 $output_dir =~ s/$/\//;
|
|
1097 }
|
|
1098 }
|
|
1099 else{
|
|
1100 $output_dir = '';
|
|
1101 }
|
|
1102
|
1
|
1103 ### Trimming at the 5' end
|
|
1104 if (defined $clip_r2){ # trimming 5' bases of read 1
|
|
1105 die "Clipping the 5' end of read 2 is only allowed for paired-end files (--paired)\n" unless ($validate);
|
|
1106 }
|
|
1107
|
|
1108 if (defined $clip_r1){ # trimming 5' bases of read 1
|
|
1109 unless ($clip_r1 > 0 and $clip_r1 < 100){
|
|
1110 die "The 5' clipping value for read 1 should have a sensible value (> 0 and < read length)\n\n";
|
|
1111 }
|
|
1112 }
|
|
1113
|
|
1114 if (defined $clip_r2){ # trimming 5' bases of read 2
|
|
1115 unless ($clip_r2 > 0 and $clip_r2 < 100){
|
|
1116 die "The 5' clipping value for read 2 should have a sensible value (> 0 and < read length)\n\n";
|
|
1117 }
|
|
1118 }
|
|
1119
|
|
1120
|
|
1121 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file,$dont_gzip,$clip_r1,$clip_r2);
|
0
|
1122 }
|
|
1123
|
|
1124
|
|
1125
|
|
1126
|
|
1127 sub print_helpfile{
|
|
1128 print << "HELP";
|
|
1129
|
|
1130 USAGE:
|
|
1131
|
|
1132 trim_galore [options] <filename(s)>
|
|
1133
|
|
1134
|
|
1135 -h/--help Print this help message and exits.
|
|
1136
|
|
1137 -v/--version Print the version information and exits.
|
|
1138
|
|
1139 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For
|
|
1140 RRBS samples, quality trimming will be performed first, and adapter
|
|
1141 trimming is carried in a second round. Other files are quality and adapter
|
|
1142 trimmed in a single pass. The algorithm is the same as the one used by BWA
|
|
1143 (Subtract INT from all qualities; compute partial sums from all indices
|
|
1144 to the end of the sequence; cut sequence at the index at which the sum is
|
|
1145 minimal). Default Phred score: 20.
|
|
1146
|
|
1147 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
|
|
1148 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
|
|
1149
|
|
1150 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
|
|
1151 (Illumina 1.5 encoding) for quality trimming.
|
|
1152
|
|
1153 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete.
|
|
1154
|
|
1155 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed
|
|
1156 to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
|
|
1157 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
|
|
1158 automatically invoke FastQC, so --fastqc does not have to be specified
|
|
1159 separately.
|
|
1160
|
|
1161 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13
|
|
1162 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
|
|
1163
|
|
1164 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
|
|
1165 option requires '--paired' to be specified as well.
|
|
1166
|
|
1167
|
|
1168 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
|
|
1169 very stringent setting of '1', i.e. even a single bp of overlapping sequence
|
|
1170 will be trimmed of the 3' end of any read.
|
|
1171
|
|
1172 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
|
|
1173 region) (default: 0.1)
|
|
1174
|
1
|
1175 --gzip Compress the output file with GZIP. If the input files are GZIP-compressed
|
|
1176 the output files will automatically be GZIP compressed as well. As of v0.2.8 the
|
|
1177 compression will take place on the fly.
|
|
1178
|
|
1179 --dont_gzip Output files won't be compressed with GZIP. This option overrides --gzip.
|
0
|
1180
|
|
1181 --length <INT> Discard reads that became shorter than length INT because of either
|
|
1182 quality or adapter trimming. A value of '0' effectively disables
|
|
1183 this behaviour. Default: 20 bp.
|
|
1184
|
|
1185 For paired-end files, both reads of a read-pair need to be longer than
|
|
1186 <INT> bp to be printed out to validated paired-end files (see option --paired).
|
|
1187 If only one read became too short there is the possibility of keeping such
|
|
1188 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
|
|
1189
|
|
1190 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current
|
|
1191 directory.
|
|
1192
|
|
1193 --no_report_file If specified no report file will be generated.
|
|
1194
|
|
1195 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed.
|
|
1196
|
1
|
1197 --clip_R1 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 1 (or single-end
|
|
1198 reads). This may be useful if the qualities were very poor, or if there is some
|
|
1199 sort of unwanted bias at the 5' end. Default: OFF.
|
|
1200
|
|
1201 --clip_R2 <int> Instructs Trim Galore to remove <int> bp from the 5' end of read 2 (paired-end reads
|
|
1202 only). This may be useful if the qualities were very poor, or if there is some sort
|
|
1203 of unwanted bias at the 5' end. For paired-end BS-Seq, it is recommended to remove
|
|
1204 the first few bp because the end-repair reaction may introduce a bias towards low
|
|
1205 methylation. Please refer to the M-bias plot section in the Bismark User Guide for
|
|
1206 some examples. Default: OFF.
|
|
1207
|
0
|
1208
|
|
1209
|
|
1210 RRBS-specific options (MspI digested material):
|
|
1211
|
|
1212 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
|
|
1213 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
|
|
1214 removed from their 3' end. This is to avoid that the filled-in C close to the
|
|
1215 second MspI site in a sequence is used for methylation calls. Sequences which
|
|
1216 were merely trimmed because of poor quality will not be shortened further.
|
|
1217
|
|
1218 --non_directional Selecting this option for non-directional RRBS libraries will screen
|
|
1219 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
|
|
1220 and, if found, removes the first two basepairs. Like with the option
|
|
1221 '--rrbs' this avoids using cytosine positions that were filled-in
|
|
1222 during the end-repair step. '--non_directional' requires '--rrbs' to
|
|
1223 be specified as well.
|
|
1224
|
|
1225 --keep Keep the quality trimmed intermediate file. Default: off, which means
|
|
1226 the temporary file is being deleted after adapter trimming. Only has
|
|
1227 an effect for RRBS samples since other FastQ files are not trimmed
|
|
1228 for poor qualities separately.
|
|
1229
|
|
1230
|
|
1231 Note for RRBS using MseI:
|
|
1232
|
|
1233 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
|
|
1234 to specify --rrbs or --non_directional since virtually all reads should start with the sequence
|
|
1235 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
|
|
1236 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
|
|
1237 run Trim Galore! in the standard (i.e. non-RRBS) mode.
|
|
1238
|
|
1239
|
|
1240 Paired-end specific options:
|
|
1241
|
|
1242 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for
|
|
1243 paired-end files. To pass the validation test, both sequences of a sequence pair
|
|
1244 are required to have a certain minimum length which is governed by the option
|
|
1245 --length (see above). If only one read passes this length threshold the
|
|
1246 other read can be rescued (see option --retain_unpaired). Using this option lets
|
|
1247 you discard too short read pairs without disturbing the sequence-by-sequence order
|
|
1248 of FastQ files which is required by many aligners.
|
|
1249
|
|
1250 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
|
|
1251 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
|
|
1252
|
|
1253 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
|
|
1254 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
|
|
1255 alignments like this:
|
|
1256
|
|
1257 R1 ---------------------------> or this: -----------------------> R1
|
|
1258 R2 <--------------------------- <----------------- R2
|
|
1259
|
|
1260 as invalid (whenever a start/end coordinate is contained within the other read).
|
|
1261
|
|
1262 --retain_unpaired If only one of the two paired-end reads became too short, the longer
|
|
1263 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
|
|
1264 output files. The length cutoff for unpaired single-end reads is
|
|
1265 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
|
|
1266
|
|
1267 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to
|
|
1268 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
|
|
1269 Default: 35 bp.
|
|
1270
|
|
1271 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
|
|
1272 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
|
|
1273 Default: 35 bp.
|
|
1274
|
|
1275
|
1
|
1276 Last modified on 15 July 2013.
|
0
|
1277
|
|
1278 HELP
|
|
1279 exit;
|
|
1280 }
|