0
|
1 #!/usr/bin/perl
|
|
2 use strict;
|
|
3 use warnings;
|
|
4 use Getopt::Long;
|
|
5 use IPC::Open3;
|
|
6 use File::Spec;
|
|
7 use File::Basename;
|
|
8 use Cwd;
|
|
9
|
|
10 ## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
|
|
11
|
|
12 ## This program is free software: you can redistribute it and/or modify
|
|
13 ## it under the terms of the GNU General Public License as published by
|
|
14 ## the Free Software Foundation, either version 3 of the License, or
|
|
15 ## (at your option) any later version.
|
|
16
|
|
17 ## This program is distributed in the hope that it will be useful,
|
|
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
20 ## GNU General Public License for more details.
|
|
21
|
|
22 ## You should have received a copy of the GNU General Public License
|
|
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
24
|
|
25
|
|
26
|
|
27 ## this script is taking in FastQ sequences and trims them with Cutadapt
|
|
28 ## last modified on 18 10 2012
|
|
29
|
|
30 ########################################################################
|
|
31
|
|
32 # change these paths if needed
|
|
33
|
|
34 my $path_to_cutadapt = 'cutadapt';
|
|
35 my $path_to_fastqc = 'fastqc';
|
|
36
|
|
37 ########################################################################
|
|
38
|
|
39
|
|
40 my $trimmer_version = '0.2.5';
|
|
41 my $DOWARN = 1; # print on screen warning and text by default
|
|
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
|
|
43
|
|
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
|
|
45
|
|
46 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
|
|
47 unless (defined $cutoff){
|
|
48 $cutoff = 20;
|
|
49 }
|
|
50 my $phred_score_cutoff = $cutoff; # only relevant for report
|
|
51
|
|
52 unless (defined $adapter){
|
|
53 $adapter = 'AGATCGGAAGAGC';
|
|
54 }
|
|
55 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
|
|
56 $a2 = '';
|
|
57 }
|
|
58
|
|
59 unless (defined $stringency){
|
|
60 $stringency = 1;
|
|
61 }
|
|
62
|
|
63 unless (defined $length_cutoff){
|
|
64 $length_cutoff = 20;
|
|
65 }
|
|
66
|
|
67 if ($phred_encoding == 64){
|
|
68 $cutoff += 31;
|
|
69 }
|
|
70
|
|
71 my @filenames = @ARGV;
|
|
72
|
|
73 my $file_1;
|
|
74 my $file_2;
|
|
75
|
|
76 foreach my $filename (@ARGV){
|
|
77 trim ($filename);
|
|
78 }
|
|
79
|
|
80
|
|
81 sub trim{
|
|
82 my $filename = shift;
|
|
83
|
|
84 my $output_filename = (split (/\//,$filename))[-1];
|
|
85 # warn "Here is the outputfile name: $output_filename\n";
|
|
86
|
|
87 my $report = $output_filename;
|
|
88 $report =~ s/$/_trimming_report.txt/;
|
|
89
|
|
90 if ($no_report_file) {
|
|
91 $report = File::Spec->devnull;
|
|
92 open (REPORT,'>',$report) or die "Failed to write to file: $!\n";
|
|
93 # warn "Redirecting report output to /dev/null\n";
|
|
94 }
|
|
95 else{
|
|
96 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n";
|
|
97 warn "Writing report to '$output_dir$report'\n";
|
|
98 }
|
|
99
|
|
100 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
101 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
|
|
102
|
|
103 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
104 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
|
|
105
|
|
106 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
107 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
|
|
108
|
|
109 warn "Adapter sequence: '$adapter'\n";
|
|
110 print REPORT "Adapter sequence: '$adapter'\n";
|
|
111
|
|
112 if ($error_rate == 0.1){
|
|
113 warn "Maximum trimming error rate: $error_rate (default)\n";
|
|
114 }
|
|
115 else{
|
|
116 warn "Maximum trimming error rate: $error_rate\n";
|
|
117 }
|
|
118
|
|
119 print REPORT "Maximum trimming error rate: $error_rate";
|
|
120 if ($error_rate == 0.1){
|
|
121 print REPORT " (default)\n";
|
|
122 }
|
|
123 else{
|
|
124 print REPORT "\n";
|
|
125 }
|
|
126
|
|
127 if ($a2){
|
|
128 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
129 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
|
|
130 }
|
|
131
|
|
132 warn "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
133 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
|
|
134
|
|
135 if ($validate){
|
|
136 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
137 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
|
|
138 }
|
|
139 else{
|
|
140 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
141 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
|
|
142 }
|
|
143
|
|
144 if ($validate){ # only for paired-end files
|
|
145
|
|
146 if ($retain){ # keeping single-end reads if only one end is long enough
|
|
147
|
|
148 if ($length_read_1 == 35){
|
|
149 warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
150 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
|
|
151 }
|
|
152 else{
|
|
153 warn "Length cut-off for read 1: $length_read_1 bp\n";
|
|
154 print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
|
|
155 }
|
|
156
|
|
157 if ($length_read_2 == 35){
|
|
158 warn "Length cut-off for read 2: $length_read_2 b (default)\n";
|
|
159 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
|
|
160 }
|
|
161 else{
|
|
162 warn "Length cut-off for read 2: $length_read_2 bp\n";
|
|
163 print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
|
|
164 }
|
|
165 }
|
|
166 }
|
|
167
|
|
168 if ($rrbs){
|
|
169 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
170 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
171 }
|
|
172
|
|
173 if ($non_directional){
|
|
174 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
175 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
|
|
176 }
|
|
177
|
|
178 if ($trim){
|
|
179 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
180 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
|
|
181 }
|
|
182
|
|
183 if ($fastqc){
|
|
184 warn "Running FastQC on the data once trimming has completed\n";
|
|
185 print REPORT "Running FastQC on the data once trimming has completed\n";
|
|
186
|
|
187 if ($fastqc_args){
|
|
188 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
|
|
189 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n";
|
|
190 }
|
|
191 }
|
|
192
|
|
193 if ($keep and $rrbs){
|
|
194 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
195 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
|
|
196 }
|
|
197
|
|
198 if ($gzip or $filename =~ /\.gz$/){
|
|
199 warn "Output file will be GZIP compressed\n";
|
|
200 print REPORT "Output file will be GZIP compressed\n";
|
|
201 }
|
|
202
|
|
203 warn "\n";
|
|
204 print REPORT "\n";
|
|
205 sleep (3);
|
|
206
|
|
207 my $temp;
|
|
208
|
|
209 ### Proceeding differently for RRBS and other type of libraries
|
|
210 if ($rrbs){
|
|
211
|
|
212 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
|
|
213 if ($cutoff == 0){
|
|
214 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n";
|
|
215 sleep (3);
|
|
216 }
|
|
217 else{
|
|
218
|
|
219 $temp = $filename;
|
|
220 $temp =~ s/$/_qual_trimmed.fastq/;
|
|
221 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!";
|
|
222
|
|
223 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
|
|
224 sleep (3);
|
|
225
|
|
226 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
|
|
227
|
|
228 my $qual_count = 0;
|
|
229
|
|
230 while (1){
|
|
231 my $l1 = <QUAL>;
|
|
232 my $seq = <QUAL>;
|
|
233 my $l3 = <QUAL>;
|
|
234 my $qual = <QUAL>;
|
|
235 last unless (defined $qual);
|
|
236
|
|
237 $qual_count++;
|
|
238 if ($qual_count%10000000 == 0){
|
|
239 warn "$qual_count sequences processed\n";
|
|
240 }
|
|
241 print TEMP "$l1$seq$l3$qual";
|
|
242 }
|
|
243
|
|
244 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
|
|
245 close QUAL or die "Unable to close filehandle: $!\n";
|
|
246 sleep (3);
|
|
247
|
|
248 }
|
|
249 }
|
|
250
|
|
251
|
|
252 if ($output_filename =~ /\.fastq$/){
|
|
253 $output_filename =~ s/\.fastq$/_trimmed.fq/;
|
|
254 }
|
|
255 elsif ($output_filename =~ /\.fastq\.gz$/){
|
|
256 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
|
|
257 }
|
|
258 elsif ($output_filename =~ /\.fq$/){
|
|
259 $output_filename =~ s/\.fq$/_trimmed.fq/;
|
|
260 }
|
|
261 elsif ($output_filename =~ /\.fq\.gz$/){
|
|
262 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
|
|
263 }
|
|
264 else{
|
|
265 $output_filename =~ s/$/_trimmed.fq/;
|
|
266 }
|
|
267
|
|
268 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
|
|
269 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
|
|
270 sleep (2);
|
|
271
|
|
272 my $count = 0;
|
|
273 my $too_short = 0;
|
|
274 my $quality_trimmed = 0;
|
|
275 my $rrbs_trimmed = 0;
|
|
276 my $rrbs_trimmed_start = 0;
|
|
277 my $CAA = 0;
|
|
278 my $CGA = 0;
|
|
279
|
|
280 if ($rrbs and $cutoff != 0){
|
|
281
|
|
282 ### optionally using 2 different adapters for read 1 and read 2
|
|
283 if ($validate and $a2){
|
|
284 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
285 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
286 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
287 sleep (3);
|
|
288 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
289 }
|
|
290 else{ # this is read 2 of a pair
|
|
291 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
|
|
292 sleep (3);
|
|
293 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
294 }
|
|
295 }
|
|
296 ### Using the same adapter for both read 1 and read 2
|
|
297 else{
|
|
298 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
|
|
299 sleep (3);
|
|
300 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
|
|
301 }
|
|
302
|
|
303 close WRITER or die $!; # not needed
|
|
304
|
|
305 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
|
|
306
|
|
307 if ($filename =~ /\.gz$/){
|
|
308 open (IN,"zcat $filename |") or die $!; # original, untrimmed file
|
|
309 }
|
|
310 else{
|
|
311 open (IN,$filename) or die $!; # original, untrimmed file
|
|
312 }
|
|
313
|
|
314 while (1){
|
|
315
|
|
316 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
|
|
317 my $l1 = <TRIM>;
|
|
318 my $seq = <TRIM>; # adapter trimmed sequence
|
|
319 my $l3 = <TRIM>;
|
|
320 my $qual = <TRIM>;
|
|
321
|
|
322 $_ = <IN>; # irrelevant
|
|
323 my $original_seq = <IN>;
|
|
324 $_ = <IN>; # irrelevant
|
|
325 $_ = <IN>; # irrelevant
|
|
326
|
|
327 $_ = <QUAL>; # irrelevant
|
|
328 my $qual_trimmed_seq = <QUAL>;
|
|
329 $_ = <QUAL>; # irrelevant
|
|
330 my $qual_trimmed_qual = <QUAL>;
|
|
331
|
|
332 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
|
|
333
|
|
334 $count++;
|
|
335 if ($count%10000000 == 0){
|
|
336 warn "$count sequences processed\n";
|
|
337 }
|
|
338
|
|
339 chomp $seq;
|
|
340 chomp $qual;
|
|
341 chomp $qual_trimmed_seq;
|
|
342 chomp $original_seq;
|
|
343
|
|
344 my $quality_trimmed_seq_length = length $qual_trimmed_seq;
|
|
345
|
|
346 if (length $original_seq > length $qual_trimmed_seq){
|
|
347 ++$quality_trimmed;
|
|
348 }
|
|
349
|
|
350 my $nd = 0;
|
|
351
|
|
352 ### NON-DIRECTIONAL RRBS
|
|
353 if ($non_directional){
|
|
354 if (length$seq > 2){
|
|
355 if ($seq =~ /^CAA/){
|
|
356 ++$CAA;
|
|
357 $seq = substr ($seq,2,length($seq)-2);
|
|
358 $qual = substr ($qual,2,length($qual)-2);
|
|
359 ++$rrbs_trimmed_start;
|
|
360 $nd = 1;
|
|
361 }
|
|
362 elsif ($seq =~ /^CGA/){
|
|
363 $seq = substr ($seq,2,length($seq)-2);
|
|
364 $qual = substr ($qual,2,length($qual)-2);
|
|
365 ++$CGA;
|
|
366 ++$rrbs_trimmed_start;
|
|
367 $nd = 1;
|
|
368 }
|
|
369 }
|
|
370 }
|
|
371
|
|
372 ### directional read
|
|
373 unless ($nd == 1){
|
|
374 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
|
|
375 $seq = substr ($seq,0,length($seq)-2);
|
|
376 $qual = substr ($qual,0,length($qual)-2);
|
|
377 ++$rrbs_trimmed;
|
|
378 }
|
|
379 }
|
|
380
|
|
381 ### Shortening all sequences by 1 bp on the 3' end
|
|
382 if ($trim){
|
|
383 $seq = substr($seq,0,length($seq)-1);
|
|
384 $qual = substr($qual,0,length($qual)-1);
|
|
385 }
|
|
386
|
|
387 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
388 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
389 print OUT "$l1$seq\n$l3$qual\n";
|
|
390 }
|
|
391 else{ # single end
|
|
392 if (length $seq < $length_cutoff){
|
|
393 ++$too_short;
|
|
394 next;
|
|
395 }
|
|
396 else{
|
|
397 print OUT "$l1$seq\n$l3$qual\n";
|
|
398 }
|
|
399 }
|
|
400 }
|
|
401
|
|
402 print REPORT "\n";
|
|
403 while (<ERROR>){
|
|
404 warn $_;
|
|
405 print REPORT $_;
|
|
406 }
|
|
407
|
|
408 close IN or die "Unable to close IN filehandle: $!";
|
|
409 close QUAL or die "Unable to close QUAL filehandle: $!";
|
|
410 close TRIM or die "Unable to close TRIM filehandle: $!";
|
|
411 close OUT or die "Unable to close OUT filehandle: $!";
|
|
412 }
|
|
413 else{
|
|
414
|
|
415 ### optionally using 2 different adapters for read 1 and read 2
|
|
416 if ($validate and $a2){
|
|
417 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
418 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
419 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
420 sleep (3);
|
|
421 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
|
422 }
|
|
423 else{ # this is read 2 of a pair
|
|
424 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
|
|
425 sleep (3);
|
|
426 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
|
|
427 }
|
|
428 }
|
|
429 ### Using the same adapter for both read 1 and read 2
|
|
430 else{
|
|
431 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
|
|
432 sleep (3);
|
|
433 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
|
|
434 }
|
|
435
|
|
436 close WRITER or die $!; # not needed
|
|
437
|
|
438 while (1){
|
|
439
|
|
440 my $l1 = <TRIM>;
|
|
441 my $seq = <TRIM>; # quality and/or adapter trimmed sequence
|
|
442 my $l3 = <TRIM>;
|
|
443 my $qual = <TRIM>;
|
|
444 # print "$l1$seq\n$l3$qual\n";
|
|
445 last unless (defined $qual); # could be an empty string
|
|
446
|
|
447 $count++;
|
|
448 if ($count%10000000 == 0){
|
|
449 warn "$count sequences processed\n";
|
|
450 }
|
|
451
|
|
452 chomp $seq;
|
|
453 chomp $qual;
|
|
454
|
|
455 ### Shortening all sequences by 1 bp on the 3' end
|
|
456 if ($trim){
|
|
457 $seq = substr($seq,0,length($seq)-1);
|
|
458 $qual = substr($qual,0,length($qual)-1);
|
|
459 }
|
|
460
|
|
461 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
|
|
462 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
|
|
463 print OUT "$l1$seq\n$l3$qual\n";
|
|
464 }
|
|
465 else{ # single end
|
|
466 if (length $seq < $length_cutoff){
|
|
467 ++$too_short;
|
|
468 next;
|
|
469 }
|
|
470 else{
|
|
471 print OUT "$l1$seq\n$l3$qual\n";
|
|
472 }
|
|
473 }
|
|
474 }
|
|
475
|
|
476 print REPORT "\n";
|
|
477 while (<ERROR>){
|
|
478 warn $_;
|
|
479 print REPORT $_;
|
|
480 }
|
|
481
|
|
482 close TRIM or die "Unable to close TRIM filehandle: $!\n";
|
|
483 close ERROR or die "Unable to close ERROR filehandle: $!\n";
|
|
484 close OUT or die "Unable to close OUT filehandle: $!\n";
|
|
485
|
|
486 }
|
|
487
|
|
488 if ($rrbs){
|
|
489 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
|
|
490
|
|
491 # deleting temporary quality trimmed file
|
|
492 my $deleted = unlink "$output_dir$temp";
|
|
493
|
|
494 if ($deleted){
|
|
495 warn "Successfully deleted temporary file $temp\n\n";
|
|
496 }
|
|
497 else{
|
|
498 warn "Could not delete temporary file $temp";
|
|
499 }
|
|
500 }
|
|
501 }
|
|
502
|
|
503 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
504 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
|
|
505
|
|
506 warn "="x 45,"\n";
|
|
507 print REPORT "="x 45,"\n";
|
|
508
|
|
509 warn "$count sequences processed in total\n";
|
|
510 print REPORT "$count sequences processed in total\n";
|
|
511
|
|
512 ### only reporting this separately if quality and adapter trimming were performed separately
|
|
513 if ($rrbs){
|
|
514 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
|
|
515 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
516 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
|
|
517 }
|
|
518
|
|
519 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
|
|
520 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
521 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
|
|
522
|
|
523 if ($rrbs){
|
|
524 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
|
|
525 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
526 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
|
|
527 }
|
|
528
|
|
529 if ($non_directional){
|
|
530 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
|
|
531 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
532 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
|
|
533 }
|
|
534
|
|
535 warn "\n";
|
|
536 print REPORT "\n";
|
|
537
|
|
538 ### RUNNING FASTQC
|
|
539 if ($fastqc){
|
|
540 warn "\n >>> Now running FastQC on the data <<<\n\n";
|
|
541 sleep (5);
|
|
542 if ($fastqc_args){
|
|
543 system ("$path_to_fastqc $fastqc_args $output_filename");
|
|
544 }
|
|
545 else{
|
|
546 system ("$path_to_fastqc $output_filename");
|
|
547 }
|
|
548 }
|
|
549
|
|
550 ### COMPRESSING OUTPUT FILE
|
|
551 unless ($validate){ # not gzipping intermediate files for paired-end files
|
|
552 if ($gzip or $filename =~ /\.gz$/){
|
|
553 warn "\n >>> GZIP-ing the output file <<<\n\n";
|
|
554 system ("gzip -f $output_filename");
|
|
555 $output_filename = $output_filename.'.gz';
|
|
556 }
|
|
557 }
|
|
558
|
|
559 ### VALIDATE PAIRED-END FILES
|
|
560 if ($validate){
|
|
561
|
|
562 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
|
|
563
|
|
564 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
|
|
565 $file_1 = $output_filename;
|
|
566 shift @filenames;
|
|
567 # warn "This is read 1: $file_1\n\n";
|
|
568 }
|
|
569 else{ # this is read 2 of a pair
|
|
570 $file_2 = $output_filename;
|
|
571 shift @filenames;
|
|
572 # warn "This is read 2: $file_2\n\n";
|
|
573 }
|
|
574
|
|
575 if ($file_1 and $file_2){
|
|
576 warn "Validate paired-end files $file_1 and $file_2\n";
|
|
577 sleep (1);
|
|
578
|
|
579 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
|
|
580
|
|
581 ### RUNNING FASTQC
|
|
582 if ($fastqc){
|
|
583
|
|
584 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n";
|
|
585 sleep (3);
|
|
586
|
|
587 if ($fastqc_args){
|
|
588 system ("$path_to_fastqc $fastqc_args $val_1");
|
|
589 }
|
|
590 else{
|
|
591 system ("$path_to_fastqc $val_1");
|
|
592 }
|
|
593
|
|
594 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n";
|
|
595 sleep (3);
|
|
596
|
|
597 if ($fastqc_args){
|
|
598 system ("$path_to_fastqc $fastqc_args $val_2");
|
|
599 }
|
|
600 else{
|
|
601 system ("$path_to_fastqc $val_2");
|
|
602 }
|
|
603
|
|
604 }
|
|
605
|
|
606 if ($gzip or $filename =~ /\.gz$/){
|
|
607
|
|
608 # compressing main fastQ output files
|
|
609 warn "Compressing the validated output file $val_1 ...\n";
|
|
610 system ("gzip -f $val_1");
|
|
611
|
|
612 warn "Compressing the validated output file $val_2 ...\n";
|
|
613 system ("gzip -f $val_2");
|
|
614
|
|
615
|
|
616 if ($retain){ # compressing unpaired reads
|
|
617 warn "Compressing the unpaired read output $un_1 ...\n";
|
|
618 system ("gzip -f $un_1");
|
|
619
|
|
620 warn "Compressing the unpaired read output $un_2 ...\n";
|
|
621 system ("gzip -f $un_2");
|
|
622 }
|
|
623 }
|
|
624
|
|
625 warn "Deleting both intermediate output files $file_1 and $file_2\n";
|
|
626 unlink "$output_dir$file_1";
|
|
627 unlink "$output_dir$file_2";
|
|
628
|
|
629 warn "\n",'='x100,"\n\n";
|
|
630 sleep (3);
|
|
631
|
|
632 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
|
|
633 $file_2 = undef;
|
|
634 }
|
|
635 }
|
|
636
|
|
637 }
|
|
638
|
|
639 sub validate_paired_end_files{
|
|
640
|
|
641 my $file_1 = shift;
|
|
642 my $file_2 = shift;
|
|
643
|
|
644 warn "file_1 $file_1 file_2 $file_2\n\n";
|
|
645
|
|
646 if ($file_1 =~ /\.gz$/){
|
|
647 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
|
|
648 }
|
|
649 else{
|
|
650 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
|
|
651 }
|
|
652
|
|
653 if ($file_2 =~ /\.gz$/){
|
|
654 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
|
|
655 }
|
|
656 else{
|
|
657 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
|
|
658 }
|
|
659
|
|
660 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
|
|
661 sleep (3);
|
|
662
|
|
663 my $out_1 = $file_1;
|
|
664 my $out_2 = $file_2;
|
|
665
|
|
666 if ($out_1 =~ /gz$/){
|
|
667 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
|
|
668 }
|
|
669 else{
|
|
670 $out_1 =~ s/trimmed\.fq$/val_1.fq/;
|
|
671 }
|
|
672
|
|
673 if ($out_2 =~ /gz$/){
|
|
674 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
|
|
675 }
|
|
676 else{
|
|
677 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
|
|
678 }
|
|
679
|
|
680 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
|
|
681 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
|
|
682 warn "Writing validated paired-end read 1 reads to $out_1\n";
|
|
683 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
|
|
684
|
|
685 my $unpaired_1;
|
|
686 my $unpaired_2;
|
|
687
|
|
688 if ($retain){
|
|
689
|
|
690 $unpaired_1 = $file_1;
|
|
691 $unpaired_2 = $file_2;
|
|
692
|
|
693 if ($unpaired_1 =~ /gz$/){
|
|
694 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
|
|
695 }
|
|
696 else{
|
|
697 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
|
|
698 }
|
|
699
|
|
700 if ($unpaired_2 =~ /gz$/){
|
|
701 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
|
|
702 }
|
|
703 else{
|
|
704 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
|
|
705 }
|
|
706
|
|
707 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
|
|
708 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
|
|
709
|
|
710 warn "Writing unpaired read 1 reads to $unpaired_1\n";
|
|
711 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
|
|
712 }
|
|
713
|
|
714 my $sequence_pairs_removed = 0;
|
|
715 my $read_1_printed = 0;
|
|
716 my $read_2_printed = 0;
|
|
717
|
|
718 my $count = 0;
|
|
719
|
|
720 while (1){
|
|
721 my $id_1 = <IN1>;
|
|
722 my $seq_1 = <IN1>;
|
|
723 my $l3_1 = <IN1>;
|
|
724 my $qual_1 = <IN1>;
|
|
725 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
|
|
726
|
|
727 my $id_2 = <IN2>;
|
|
728 my $seq_2 = <IN2>;
|
|
729 my $l3_2 = <IN2>;
|
|
730 my $qual_2 = <IN2>;
|
|
731 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
|
|
732
|
|
733 ++$count;
|
|
734
|
|
735
|
|
736 ## small check if the sequence files appear to FastQ files
|
|
737 if ($count == 1){ # performed just once
|
|
738 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
|
|
739 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
740 }
|
|
741 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
|
|
742 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
|
|
743 }
|
|
744 }
|
|
745
|
|
746 chomp $seq_1;
|
|
747 chomp $seq_2;
|
|
748
|
|
749
|
|
750 ### making sure that the reads do have a sensible length
|
|
751 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
|
|
752 ++$sequence_pairs_removed;
|
|
753 if ($retain){ # writing out single-end reads if they are longer than the cutoff
|
|
754
|
|
755 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
|
|
756 print UNPAIRED1 $id_1;
|
|
757 print UNPAIRED1 "$seq_1\n";
|
|
758 print UNPAIRED1 $l3_1;
|
|
759 print UNPAIRED1 $qual_1;
|
|
760 ++$read_1_printed;
|
|
761 }
|
|
762
|
|
763 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
|
|
764 print UNPAIRED2 $id_2;
|
|
765 print UNPAIRED2 "$seq_2\n";
|
|
766 print UNPAIRED2 $l3_2;
|
|
767 print UNPAIRED2 $qual_2;
|
|
768 ++$read_2_printed;
|
|
769 }
|
|
770
|
|
771 }
|
|
772 }
|
|
773 else{
|
|
774 print R1 $id_1;
|
|
775 print R1 "$seq_1\n";
|
|
776 print R1 $l3_1;
|
|
777 print R1 $qual_1;
|
|
778
|
|
779 print R2 $id_2;
|
|
780 print R2 "$seq_2\n";
|
|
781 print R2 $l3_2;
|
|
782 print R2 $qual_2;
|
|
783 }
|
|
784
|
|
785 }
|
|
786 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
|
|
787 warn "Total number of sequences analysed: $count\n\n";
|
|
788 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n";
|
|
789
|
|
790 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
|
|
791 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
|
|
792
|
|
793 if ($keep){
|
|
794 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
|
|
795 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
|
|
796 }
|
|
797
|
|
798 warn "\n";
|
|
799 if ($retain){
|
|
800 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
|
|
801 }
|
|
802 else{
|
|
803 return ($out_1,$out_2);
|
|
804 }
|
|
805 }
|
|
806
|
|
807 sub process_commandline{
|
|
808 my $help;
|
|
809 my $quality;
|
|
810 my $adapter;
|
|
811 my $adapter2;
|
|
812 my $stringency;
|
|
813 my $report;
|
|
814 my $version;
|
|
815 my $rrbs;
|
|
816 my $length_cutoff;
|
|
817 my $keep;
|
|
818 my $fastqc;
|
|
819 my $non_directional;
|
|
820 my $phred33;
|
|
821 my $phred64;
|
|
822 my $fastqc_args;
|
|
823 my $trim;
|
|
824 my $gzip;
|
|
825 my $validate;
|
|
826 my $retain;
|
|
827 my $length_read_1;
|
|
828 my $length_read_2;
|
|
829 my $error_rate;
|
|
830 my $output_dir;
|
|
831 my $no_report_file;
|
|
832 my $suppress_warn;
|
|
833
|
|
834 my $command_line = GetOptions ('help|man' => \$help,
|
|
835 'q|quality=i' => \$quality,
|
|
836 'a|adapter=s' => \$adapter,
|
|
837 'a2|adapter2=s' => \$adapter2,
|
|
838 'report' => \$report,
|
|
839 'version' => \$version,
|
|
840 'stringency=i' => \$stringency,
|
|
841 'fastqc' => \$fastqc,
|
|
842 'RRBS' => \$rrbs,
|
|
843 'keep' => \$keep,
|
|
844 'length=i' => \$length_cutoff,
|
|
845 'non_directional' => \$non_directional,
|
|
846 'phred33' => \$phred33,
|
|
847 'phred64' => \$phred64,
|
|
848 'fastqc_args=s' => \$fastqc_args,
|
|
849 'trim1' => \$trim,
|
|
850 'gzip' => \$gzip,
|
|
851 'paired_end' => \$validate,
|
|
852 'retain_unpaired' => \$retain,
|
|
853 'length_1|r1=i' => \$length_read_1,
|
|
854 'length_2|r2=i' => \$length_read_2,
|
|
855 'e|error_rate=s' => \$error_rate,
|
|
856 'o|output_dir=s' => \$output_dir,
|
|
857 'no_report_file' => \$no_report_file,
|
|
858 'suppress_warn' => \$suppress_warn,
|
|
859 );
|
|
860
|
|
861 ### EXIT ON ERROR if there were errors with any of the supplied options
|
|
862 unless ($command_line){
|
|
863 die "Please respecify command line options\n";
|
|
864 }
|
|
865
|
|
866 ### HELPFILE
|
|
867 if ($help){
|
|
868 print_helpfile();
|
|
869 exit;
|
|
870 }
|
|
871
|
|
872
|
|
873
|
|
874
|
|
875 if ($version){
|
|
876 print << "VERSION";
|
|
877
|
|
878 Quality-/Adapter-/RRBS-Trimming
|
|
879 (powered by Cutadapt)
|
|
880 version $trimmer_version
|
|
881
|
|
882 Last update: 18 10 2012
|
|
883
|
|
884 VERSION
|
|
885 exit;
|
|
886 }
|
|
887
|
|
888 ### RRBS
|
|
889 unless ($rrbs){
|
|
890 $rrbs = 0;
|
|
891 }
|
|
892
|
|
893 ### SUPRESS WARNINGS
|
|
894 if (defined $suppress_warn){
|
|
895 $DOWARN = 0;
|
|
896 }
|
|
897
|
|
898 ### QUALITY SCORES
|
|
899 my $phred_encoding;
|
|
900 if ($phred33){
|
|
901 if ($phred64){
|
|
902 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
|
|
903 }
|
|
904 $phred_encoding = 33;
|
|
905 }
|
|
906 elsif ($phred64){
|
|
907 $phred_encoding = 64;
|
|
908 }
|
|
909 unless ($phred33 or $phred64){
|
|
910 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
|
|
911 $phred_encoding = 33;
|
|
912 sleep (3);
|
|
913 }
|
|
914
|
|
915 ### NON-DIRECTIONAL RRBS
|
|
916 if ($non_directional){
|
|
917 unless ($rrbs){
|
|
918 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
|
|
919 }
|
|
920 }
|
|
921 else{
|
|
922 $non_directional = 0;
|
|
923 }
|
|
924
|
|
925 if ($fastqc_args){
|
|
926 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
|
|
927 }
|
|
928 else{
|
|
929 $fastqc_args = 0;
|
|
930 }
|
|
931
|
|
932 ### CUSTOM ERROR RATE
|
|
933 if (defined $error_rate){
|
|
934 # make sure that the error rate is between 0 and 1
|
|
935 unless ($error_rate >= 0 and $error_rate <= 1){
|
|
936 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
|
|
937 }
|
|
938 }
|
|
939 else{
|
|
940 $error_rate = 0.1; # (default)
|
|
941 }
|
|
942
|
|
943 if (defined $adapter){
|
|
944 unless ($adapter =~ /^[ACTGNactgn]+$/){
|
|
945 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
946 }
|
|
947 $adapter = uc$adapter;
|
|
948 }
|
|
949
|
|
950 if (defined $adapter2){
|
|
951 unless ($validate){
|
|
952 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
|
|
953 }
|
|
954 unless ($adapter2 =~ /^[ACTGNactgn]+$/){
|
|
955 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
|
|
956 }
|
|
957 $adapter2 = uc$adapter2;
|
|
958 }
|
|
959
|
|
960 ### files are supposed to be paired-end files
|
|
961 if ($validate){
|
|
962
|
|
963 # making sure that an even number of reads has been supplied
|
|
964 unless ((scalar@ARGV)%2 == 0){
|
|
965 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
|
|
966 }
|
|
967
|
|
968 ## CUTOFF FOR VALIDATED READ-PAIRS
|
|
969
|
|
970 if (defined $length_read_1 or defined $length_read_2){
|
|
971 unless ($retain){
|
|
972 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
|
|
973 }
|
|
974
|
|
975 if (defined $length_read_1){
|
|
976 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
|
|
977 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
978 }
|
|
979 unless ($length_read_1 > $length_cutoff){
|
|
980 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
|
|
981 }
|
|
982 }
|
|
983
|
|
984 if (defined $length_read_2){
|
|
985 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
|
|
986 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
|
|
987 }
|
|
988 unless ($length_read_2 > $length_cutoff){
|
|
989 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
|
|
990 }
|
|
991 }
|
|
992 }
|
|
993
|
|
994 if ($retain){
|
|
995 $length_read_1 = 35 unless (defined $length_read_1);
|
|
996 $length_read_2 = 35 unless (defined $length_read_2);
|
|
997 }
|
|
998 }
|
|
999
|
|
1000 unless ($no_report_file){
|
|
1001 $no_report_file = 0;
|
|
1002 }
|
|
1003
|
|
1004 ### OUTPUT DIR PATH
|
|
1005 if ($output_dir){
|
|
1006 unless ($output_dir =~ /\/$/){
|
|
1007 $output_dir =~ s/$/\//;
|
|
1008 }
|
|
1009 }
|
|
1010 else{
|
|
1011 $output_dir = '';
|
|
1012 }
|
|
1013
|
|
1014 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
|
|
1015 }
|
|
1016
|
|
1017
|
|
1018
|
|
1019
|
|
1020 sub print_helpfile{
|
|
1021 print << "HELP";
|
|
1022
|
|
1023 USAGE:
|
|
1024
|
|
1025 trim_galore [options] <filename(s)>
|
|
1026
|
|
1027
|
|
1028 -h/--help Print this help message and exits.
|
|
1029
|
|
1030 -v/--version Print the version information and exits.
|
|
1031
|
|
1032 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For
|
|
1033 RRBS samples, quality trimming will be performed first, and adapter
|
|
1034 trimming is carried in a second round. Other files are quality and adapter
|
|
1035 trimmed in a single pass. The algorithm is the same as the one used by BWA
|
|
1036 (Subtract INT from all qualities; compute partial sums from all indices
|
|
1037 to the end of the sequence; cut sequence at the index at which the sum is
|
|
1038 minimal). Default Phred score: 20.
|
|
1039
|
|
1040 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
|
|
1041 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
|
|
1042
|
|
1043 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
|
|
1044 (Illumina 1.5 encoding) for quality trimming.
|
|
1045
|
|
1046 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete.
|
|
1047
|
|
1048 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed
|
|
1049 to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
|
|
1050 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
|
|
1051 automatically invoke FastQC, so --fastqc does not have to be specified
|
|
1052 separately.
|
|
1053
|
|
1054 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13
|
|
1055 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
|
|
1056
|
|
1057 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
|
|
1058 option requires '--paired' to be specified as well.
|
|
1059
|
|
1060
|
|
1061 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
|
|
1062 very stringent setting of '1', i.e. even a single bp of overlapping sequence
|
|
1063 will be trimmed of the 3' end of any read.
|
|
1064
|
|
1065 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
|
|
1066 region) (default: 0.1)
|
|
1067
|
|
1068 --gzip Compress the output file with gzip. If the input files are gzip-compressed
|
|
1069 the output files will be automatically gzip compressed as well.
|
|
1070
|
|
1071 --length <INT> Discard reads that became shorter than length INT because of either
|
|
1072 quality or adapter trimming. A value of '0' effectively disables
|
|
1073 this behaviour. Default: 20 bp.
|
|
1074
|
|
1075 For paired-end files, both reads of a read-pair need to be longer than
|
|
1076 <INT> bp to be printed out to validated paired-end files (see option --paired).
|
|
1077 If only one read became too short there is the possibility of keeping such
|
|
1078 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
|
|
1079
|
|
1080 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current
|
|
1081 directory.
|
|
1082
|
|
1083 --no_report_file If specified no report file will be generated.
|
|
1084
|
|
1085 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed.
|
|
1086
|
|
1087
|
|
1088
|
|
1089 RRBS-specific options (MspI digested material):
|
|
1090
|
|
1091 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
|
|
1092 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
|
|
1093 removed from their 3' end. This is to avoid that the filled-in C close to the
|
|
1094 second MspI site in a sequence is used for methylation calls. Sequences which
|
|
1095 were merely trimmed because of poor quality will not be shortened further.
|
|
1096
|
|
1097 --non_directional Selecting this option for non-directional RRBS libraries will screen
|
|
1098 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
|
|
1099 and, if found, removes the first two basepairs. Like with the option
|
|
1100 '--rrbs' this avoids using cytosine positions that were filled-in
|
|
1101 during the end-repair step. '--non_directional' requires '--rrbs' to
|
|
1102 be specified as well.
|
|
1103
|
|
1104 --keep Keep the quality trimmed intermediate file. Default: off, which means
|
|
1105 the temporary file is being deleted after adapter trimming. Only has
|
|
1106 an effect for RRBS samples since other FastQ files are not trimmed
|
|
1107 for poor qualities separately.
|
|
1108
|
|
1109
|
|
1110 Note for RRBS using MseI:
|
|
1111
|
|
1112 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
|
|
1113 to specify --rrbs or --non_directional since virtually all reads should start with the sequence
|
|
1114 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
|
|
1115 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
|
|
1116 run Trim Galore! in the standard (i.e. non-RRBS) mode.
|
|
1117
|
|
1118
|
|
1119 Paired-end specific options:
|
|
1120
|
|
1121 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for
|
|
1122 paired-end files. To pass the validation test, both sequences of a sequence pair
|
|
1123 are required to have a certain minimum length which is governed by the option
|
|
1124 --length (see above). If only one read passes this length threshold the
|
|
1125 other read can be rescued (see option --retain_unpaired). Using this option lets
|
|
1126 you discard too short read pairs without disturbing the sequence-by-sequence order
|
|
1127 of FastQ files which is required by many aligners.
|
|
1128
|
|
1129 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
|
|
1130 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
|
|
1131
|
|
1132 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
|
|
1133 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
|
|
1134 alignments like this:
|
|
1135
|
|
1136 R1 ---------------------------> or this: -----------------------> R1
|
|
1137 R2 <--------------------------- <----------------- R2
|
|
1138
|
|
1139 as invalid (whenever a start/end coordinate is contained within the other read).
|
|
1140
|
|
1141 --retain_unpaired If only one of the two paired-end reads became too short, the longer
|
|
1142 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
|
|
1143 output files. The length cutoff for unpaired single-end reads is
|
|
1144 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
|
|
1145
|
|
1146 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to
|
|
1147 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
|
|
1148 Default: 35 bp.
|
|
1149
|
|
1150 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
|
|
1151 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
|
|
1152 Default: 35 bp.
|
|
1153
|
|
1154
|
|
1155 Last modified on 18 Oct 2012.
|
|
1156
|
|
1157 HELP
|
|
1158 exit;
|
|
1159 }
|