Mercurial > repos > bgruening > trim_galore
comparison trim_galore @ 0:3c1664caa8e3 draft
Uploaded
author | bgruening |
---|---|
date | Sat, 06 Jul 2013 09:52:23 -0400 |
parents | |
children | 898db63d2e84 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3c1664caa8e3 |
---|---|
1 #!/usr/bin/perl | |
2 use strict; | |
3 use warnings; | |
4 use Getopt::Long; | |
5 use IPC::Open3; | |
6 use File::Spec; | |
7 use File::Basename; | |
8 use Cwd; | |
9 | |
10 ## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk) | |
11 | |
12 ## This program is free software: you can redistribute it and/or modify | |
13 ## it under the terms of the GNU General Public License as published by | |
14 ## the Free Software Foundation, either version 3 of the License, or | |
15 ## (at your option) any later version. | |
16 | |
17 ## This program is distributed in the hope that it will be useful, | |
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 ## GNU General Public License for more details. | |
21 | |
22 ## You should have received a copy of the GNU General Public License | |
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
24 | |
25 | |
26 | |
27 ## this script is taking in FastQ sequences and trims them with Cutadapt | |
28 ## last modified on 18 10 2012 | |
29 | |
30 ######################################################################## | |
31 | |
32 # change these paths if needed | |
33 | |
34 my $path_to_cutadapt = 'cutadapt'; | |
35 my $path_to_fastqc = 'fastqc'; | |
36 | |
37 ######################################################################## | |
38 | |
39 | |
40 my $trimmer_version = '0.2.5'; | |
41 my $DOWARN = 1; # print on screen warning and text by default | |
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } }; | |
43 | |
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline(); | |
45 | |
46 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED | |
47 unless (defined $cutoff){ | |
48 $cutoff = 20; | |
49 } | |
50 my $phred_score_cutoff = $cutoff; # only relevant for report | |
51 | |
52 unless (defined $adapter){ | |
53 $adapter = 'AGATCGGAAGAGC'; | |
54 } | |
55 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming | |
56 $a2 = ''; | |
57 } | |
58 | |
59 unless (defined $stringency){ | |
60 $stringency = 1; | |
61 } | |
62 | |
63 unless (defined $length_cutoff){ | |
64 $length_cutoff = 20; | |
65 } | |
66 | |
67 if ($phred_encoding == 64){ | |
68 $cutoff += 31; | |
69 } | |
70 | |
71 my @filenames = @ARGV; | |
72 | |
73 my $file_1; | |
74 my $file_2; | |
75 | |
76 foreach my $filename (@ARGV){ | |
77 trim ($filename); | |
78 } | |
79 | |
80 | |
81 sub trim{ | |
82 my $filename = shift; | |
83 | |
84 my $output_filename = (split (/\//,$filename))[-1]; | |
85 # warn "Here is the outputfile name: $output_filename\n"; | |
86 | |
87 my $report = $output_filename; | |
88 $report =~ s/$/_trimming_report.txt/; | |
89 | |
90 if ($no_report_file) { | |
91 $report = File::Spec->devnull; | |
92 open (REPORT,'>',$report) or die "Failed to write to file: $!\n"; | |
93 # warn "Redirecting report output to /dev/null\n"; | |
94 } | |
95 else{ | |
96 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n"; | |
97 warn "Writing report to '$output_dir$report'\n"; | |
98 } | |
99 | |
100 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; | |
101 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n"; | |
102 | |
103 warn "Quality Phred score cutoff: $phred_score_cutoff\n"; | |
104 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n"; | |
105 | |
106 warn "Quality encoding type selected: ASCII+$phred_encoding\n"; | |
107 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n"; | |
108 | |
109 warn "Adapter sequence: '$adapter'\n"; | |
110 print REPORT "Adapter sequence: '$adapter'\n"; | |
111 | |
112 if ($error_rate == 0.1){ | |
113 warn "Maximum trimming error rate: $error_rate (default)\n"; | |
114 } | |
115 else{ | |
116 warn "Maximum trimming error rate: $error_rate\n"; | |
117 } | |
118 | |
119 print REPORT "Maximum trimming error rate: $error_rate"; | |
120 if ($error_rate == 0.1){ | |
121 print REPORT " (default)\n"; | |
122 } | |
123 else{ | |
124 print REPORT "\n"; | |
125 } | |
126 | |
127 if ($a2){ | |
128 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; | |
129 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n"; | |
130 } | |
131 | |
132 warn "Minimum required adapter overlap (stringency): $stringency bp\n"; | |
133 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n"; | |
134 | |
135 if ($validate){ | |
136 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; | |
137 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n"; | |
138 } | |
139 else{ | |
140 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; | |
141 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n"; | |
142 } | |
143 | |
144 if ($validate){ # only for paired-end files | |
145 | |
146 if ($retain){ # keeping single-end reads if only one end is long enough | |
147 | |
148 if ($length_read_1 == 35){ | |
149 warn "Length cut-off for read 1: $length_read_1 bp (default)\n"; | |
150 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n"; | |
151 } | |
152 else{ | |
153 warn "Length cut-off for read 1: $length_read_1 bp\n"; | |
154 print REPORT "Length cut-off for read 1: $length_read_1 bp\n"; | |
155 } | |
156 | |
157 if ($length_read_2 == 35){ | |
158 warn "Length cut-off for read 2: $length_read_2 b (default)\n"; | |
159 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n"; | |
160 } | |
161 else{ | |
162 warn "Length cut-off for read 2: $length_read_2 bp\n"; | |
163 print REPORT "Length cut-off for read 2: $length_read_2 bp\n"; | |
164 } | |
165 } | |
166 } | |
167 | |
168 if ($rrbs){ | |
169 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; | |
170 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n"; | |
171 } | |
172 | |
173 if ($non_directional){ | |
174 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; | |
175 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n"; | |
176 } | |
177 | |
178 if ($trim){ | |
179 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; | |
180 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n"; | |
181 } | |
182 | |
183 if ($fastqc){ | |
184 warn "Running FastQC on the data once trimming has completed\n"; | |
185 print REPORT "Running FastQC on the data once trimming has completed\n"; | |
186 | |
187 if ($fastqc_args){ | |
188 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n"; | |
189 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n"; | |
190 } | |
191 } | |
192 | |
193 if ($keep and $rrbs){ | |
194 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; | |
195 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n"; | |
196 } | |
197 | |
198 if ($gzip or $filename =~ /\.gz$/){ | |
199 warn "Output file will be GZIP compressed\n"; | |
200 print REPORT "Output file will be GZIP compressed\n"; | |
201 } | |
202 | |
203 warn "\n"; | |
204 print REPORT "\n"; | |
205 sleep (3); | |
206 | |
207 my $temp; | |
208 | |
209 ### Proceeding differently for RRBS and other type of libraries | |
210 if ($rrbs){ | |
211 | |
212 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified | |
213 if ($cutoff == 0){ | |
214 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n"; | |
215 sleep (3); | |
216 } | |
217 else{ | |
218 | |
219 $temp = $filename; | |
220 $temp =~ s/$/_qual_trimmed.fastq/; | |
221 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!"; | |
222 | |
223 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n"; | |
224 sleep (3); | |
225 | |
226 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!"; | |
227 | |
228 my $qual_count = 0; | |
229 | |
230 while (1){ | |
231 my $l1 = <QUAL>; | |
232 my $seq = <QUAL>; | |
233 my $l3 = <QUAL>; | |
234 my $qual = <QUAL>; | |
235 last unless (defined $qual); | |
236 | |
237 $qual_count++; | |
238 if ($qual_count%10000000 == 0){ | |
239 warn "$qual_count sequences processed\n"; | |
240 } | |
241 print TEMP "$l1$seq$l3$qual"; | |
242 } | |
243 | |
244 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n"; | |
245 close QUAL or die "Unable to close filehandle: $!\n"; | |
246 sleep (3); | |
247 | |
248 } | |
249 } | |
250 | |
251 | |
252 if ($output_filename =~ /\.fastq$/){ | |
253 $output_filename =~ s/\.fastq$/_trimmed.fq/; | |
254 } | |
255 elsif ($output_filename =~ /\.fastq\.gz$/){ | |
256 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/; | |
257 } | |
258 elsif ($output_filename =~ /\.fq$/){ | |
259 $output_filename =~ s/\.fq$/_trimmed.fq/; | |
260 } | |
261 elsif ($output_filename =~ /\.fq\.gz$/){ | |
262 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/; | |
263 } | |
264 else{ | |
265 $output_filename =~ s/$/_trimmed.fq/; | |
266 } | |
267 | |
268 warn "Writing final adapter and quality trimmed output to $output_filename\n\n"; | |
269 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n"; | |
270 sleep (2); | |
271 | |
272 my $count = 0; | |
273 my $too_short = 0; | |
274 my $quality_trimmed = 0; | |
275 my $rrbs_trimmed = 0; | |
276 my $rrbs_trimmed_start = 0; | |
277 my $CAA = 0; | |
278 my $CGA = 0; | |
279 | |
280 if ($rrbs and $cutoff != 0){ | |
281 | |
282 ### optionally using 2 different adapters for read 1 and read 2 | |
283 if ($validate and $a2){ | |
284 ### Figure out whether current file counts as read 1 or read 2 of paired-end files | |
285 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair | |
286 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; | |
287 sleep (3); | |
288 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; | |
289 } | |
290 else{ # this is read 2 of a pair | |
291 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n"; | |
292 sleep (3); | |
293 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; | |
294 } | |
295 } | |
296 ### Using the same adapter for both read 1 and read 2 | |
297 else{ | |
298 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n"; | |
299 sleep (3); | |
300 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n"; | |
301 } | |
302 | |
303 close WRITER or die $!; # not needed | |
304 | |
305 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file | |
306 | |
307 if ($filename =~ /\.gz$/){ | |
308 open (IN,"zcat $filename |") or die $!; # original, untrimmed file | |
309 } | |
310 else{ | |
311 open (IN,$filename) or die $!; # original, untrimmed file | |
312 } | |
313 | |
314 while (1){ | |
315 | |
316 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not | |
317 my $l1 = <TRIM>; | |
318 my $seq = <TRIM>; # adapter trimmed sequence | |
319 my $l3 = <TRIM>; | |
320 my $qual = <TRIM>; | |
321 | |
322 $_ = <IN>; # irrelevant | |
323 my $original_seq = <IN>; | |
324 $_ = <IN>; # irrelevant | |
325 $_ = <IN>; # irrelevant | |
326 | |
327 $_ = <QUAL>; # irrelevant | |
328 my $qual_trimmed_seq = <QUAL>; | |
329 $_ = <QUAL>; # irrelevant | |
330 my $qual_trimmed_qual = <QUAL>; | |
331 | |
332 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings | |
333 | |
334 $count++; | |
335 if ($count%10000000 == 0){ | |
336 warn "$count sequences processed\n"; | |
337 } | |
338 | |
339 chomp $seq; | |
340 chomp $qual; | |
341 chomp $qual_trimmed_seq; | |
342 chomp $original_seq; | |
343 | |
344 my $quality_trimmed_seq_length = length $qual_trimmed_seq; | |
345 | |
346 if (length $original_seq > length $qual_trimmed_seq){ | |
347 ++$quality_trimmed; | |
348 } | |
349 | |
350 my $nd = 0; | |
351 | |
352 ### NON-DIRECTIONAL RRBS | |
353 if ($non_directional){ | |
354 if (length$seq > 2){ | |
355 if ($seq =~ /^CAA/){ | |
356 ++$CAA; | |
357 $seq = substr ($seq,2,length($seq)-2); | |
358 $qual = substr ($qual,2,length($qual)-2); | |
359 ++$rrbs_trimmed_start; | |
360 $nd = 1; | |
361 } | |
362 elsif ($seq =~ /^CGA/){ | |
363 $seq = substr ($seq,2,length($seq)-2); | |
364 $qual = substr ($qual,2,length($qual)-2); | |
365 ++$CGA; | |
366 ++$rrbs_trimmed_start; | |
367 $nd = 1; | |
368 } | |
369 } | |
370 } | |
371 | |
372 ### directional read | |
373 unless ($nd == 1){ | |
374 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){ | |
375 $seq = substr ($seq,0,length($seq)-2); | |
376 $qual = substr ($qual,0,length($qual)-2); | |
377 ++$rrbs_trimmed; | |
378 } | |
379 } | |
380 | |
381 ### Shortening all sequences by 1 bp on the 3' end | |
382 if ($trim){ | |
383 $seq = substr($seq,0,length($seq)-1); | |
384 $qual = substr($qual,0,length($qual)-1); | |
385 } | |
386 | |
387 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE | |
388 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) | |
389 print OUT "$l1$seq\n$l3$qual\n"; | |
390 } | |
391 else{ # single end | |
392 if (length $seq < $length_cutoff){ | |
393 ++$too_short; | |
394 next; | |
395 } | |
396 else{ | |
397 print OUT "$l1$seq\n$l3$qual\n"; | |
398 } | |
399 } | |
400 } | |
401 | |
402 print REPORT "\n"; | |
403 while (<ERROR>){ | |
404 warn $_; | |
405 print REPORT $_; | |
406 } | |
407 | |
408 close IN or die "Unable to close IN filehandle: $!"; | |
409 close QUAL or die "Unable to close QUAL filehandle: $!"; | |
410 close TRIM or die "Unable to close TRIM filehandle: $!"; | |
411 close OUT or die "Unable to close OUT filehandle: $!"; | |
412 } | |
413 else{ | |
414 | |
415 ### optionally using 2 different adapters for read 1 and read 2 | |
416 if ($validate and $a2){ | |
417 ### Figure out whether current file counts as read 1 or read 2 of paired-end files | |
418 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair | |
419 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; | |
420 sleep (3); | |
421 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; | |
422 } | |
423 else{ # this is read 2 of a pair | |
424 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n"; | |
425 sleep (3); | |
426 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!"; | |
427 } | |
428 } | |
429 ### Using the same adapter for both read 1 and read 2 | |
430 else{ | |
431 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n"; | |
432 sleep (3); | |
433 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!"; | |
434 } | |
435 | |
436 close WRITER or die $!; # not needed | |
437 | |
438 while (1){ | |
439 | |
440 my $l1 = <TRIM>; | |
441 my $seq = <TRIM>; # quality and/or adapter trimmed sequence | |
442 my $l3 = <TRIM>; | |
443 my $qual = <TRIM>; | |
444 # print "$l1$seq\n$l3$qual\n"; | |
445 last unless (defined $qual); # could be an empty string | |
446 | |
447 $count++; | |
448 if ($count%10000000 == 0){ | |
449 warn "$count sequences processed\n"; | |
450 } | |
451 | |
452 chomp $seq; | |
453 chomp $qual; | |
454 | |
455 ### Shortening all sequences by 1 bp on the 3' end | |
456 if ($trim){ | |
457 $seq = substr($seq,0,length($seq)-1); | |
458 $qual = substr($qual,0,length($qual)-1); | |
459 } | |
460 | |
461 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE | |
462 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later) | |
463 print OUT "$l1$seq\n$l3$qual\n"; | |
464 } | |
465 else{ # single end | |
466 if (length $seq < $length_cutoff){ | |
467 ++$too_short; | |
468 next; | |
469 } | |
470 else{ | |
471 print OUT "$l1$seq\n$l3$qual\n"; | |
472 } | |
473 } | |
474 } | |
475 | |
476 print REPORT "\n"; | |
477 while (<ERROR>){ | |
478 warn $_; | |
479 print REPORT $_; | |
480 } | |
481 | |
482 close TRIM or die "Unable to close TRIM filehandle: $!\n"; | |
483 close ERROR or die "Unable to close ERROR filehandle: $!\n"; | |
484 close OUT or die "Unable to close OUT filehandle: $!\n"; | |
485 | |
486 } | |
487 | |
488 if ($rrbs){ | |
489 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files | |
490 | |
491 # deleting temporary quality trimmed file | |
492 my $deleted = unlink "$output_dir$temp"; | |
493 | |
494 if ($deleted){ | |
495 warn "Successfully deleted temporary file $temp\n\n"; | |
496 } | |
497 else{ | |
498 warn "Could not delete temporary file $temp"; | |
499 } | |
500 } | |
501 } | |
502 | |
503 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; | |
504 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n"; | |
505 | |
506 warn "="x 45,"\n"; | |
507 print REPORT "="x 45,"\n"; | |
508 | |
509 warn "$count sequences processed in total\n"; | |
510 print REPORT "$count sequences processed in total\n"; | |
511 | |
512 ### only reporting this separately if quality and adapter trimming were performed separately | |
513 if ($rrbs){ | |
514 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100); | |
515 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; | |
516 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n"; | |
517 } | |
518 | |
519 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100); | |
520 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; | |
521 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n"; | |
522 | |
523 if ($rrbs){ | |
524 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100); | |
525 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; | |
526 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n"; | |
527 } | |
528 | |
529 if ($non_directional){ | |
530 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100); | |
531 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; | |
532 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n"; | |
533 } | |
534 | |
535 warn "\n"; | |
536 print REPORT "\n"; | |
537 | |
538 ### RUNNING FASTQC | |
539 if ($fastqc){ | |
540 warn "\n >>> Now running FastQC on the data <<<\n\n"; | |
541 sleep (5); | |
542 if ($fastqc_args){ | |
543 system ("$path_to_fastqc $fastqc_args $output_filename"); | |
544 } | |
545 else{ | |
546 system ("$path_to_fastqc $output_filename"); | |
547 } | |
548 } | |
549 | |
550 ### COMPRESSING OUTPUT FILE | |
551 unless ($validate){ # not gzipping intermediate files for paired-end files | |
552 if ($gzip or $filename =~ /\.gz$/){ | |
553 warn "\n >>> GZIP-ing the output file <<<\n\n"; | |
554 system ("gzip -f $output_filename"); | |
555 $output_filename = $output_filename.'.gz'; | |
556 } | |
557 } | |
558 | |
559 ### VALIDATE PAIRED-END FILES | |
560 if ($validate){ | |
561 | |
562 ### Figure out whether current file counts as read 1 or read 2 of paired-end files | |
563 | |
564 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair | |
565 $file_1 = $output_filename; | |
566 shift @filenames; | |
567 # warn "This is read 1: $file_1\n\n"; | |
568 } | |
569 else{ # this is read 2 of a pair | |
570 $file_2 = $output_filename; | |
571 shift @filenames; | |
572 # warn "This is read 2: $file_2\n\n"; | |
573 } | |
574 | |
575 if ($file_1 and $file_2){ | |
576 warn "Validate paired-end files $file_1 and $file_2\n"; | |
577 sleep (1); | |
578 | |
579 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2); | |
580 | |
581 ### RUNNING FASTQC | |
582 if ($fastqc){ | |
583 | |
584 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n"; | |
585 sleep (3); | |
586 | |
587 if ($fastqc_args){ | |
588 system ("$path_to_fastqc $fastqc_args $val_1"); | |
589 } | |
590 else{ | |
591 system ("$path_to_fastqc $val_1"); | |
592 } | |
593 | |
594 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n"; | |
595 sleep (3); | |
596 | |
597 if ($fastqc_args){ | |
598 system ("$path_to_fastqc $fastqc_args $val_2"); | |
599 } | |
600 else{ | |
601 system ("$path_to_fastqc $val_2"); | |
602 } | |
603 | |
604 } | |
605 | |
606 if ($gzip or $filename =~ /\.gz$/){ | |
607 | |
608 # compressing main fastQ output files | |
609 warn "Compressing the validated output file $val_1 ...\n"; | |
610 system ("gzip -f $val_1"); | |
611 | |
612 warn "Compressing the validated output file $val_2 ...\n"; | |
613 system ("gzip -f $val_2"); | |
614 | |
615 | |
616 if ($retain){ # compressing unpaired reads | |
617 warn "Compressing the unpaired read output $un_1 ...\n"; | |
618 system ("gzip -f $un_1"); | |
619 | |
620 warn "Compressing the unpaired read output $un_2 ...\n"; | |
621 system ("gzip -f $un_2"); | |
622 } | |
623 } | |
624 | |
625 warn "Deleting both intermediate output files $file_1 and $file_2\n"; | |
626 unlink "$output_dir$file_1"; | |
627 unlink "$output_dir$file_2"; | |
628 | |
629 warn "\n",'='x100,"\n\n"; | |
630 sleep (3); | |
631 | |
632 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed | |
633 $file_2 = undef; | |
634 } | |
635 } | |
636 | |
637 } | |
638 | |
639 sub validate_paired_end_files{ | |
640 | |
641 my $file_1 = shift; | |
642 my $file_2 = shift; | |
643 | |
644 warn "file_1 $file_1 file_2 $file_2\n\n"; | |
645 | |
646 if ($file_1 =~ /\.gz$/){ | |
647 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n"; | |
648 } | |
649 else{ | |
650 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n"; | |
651 } | |
652 | |
653 if ($file_2 =~ /\.gz$/){ | |
654 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n"; | |
655 } | |
656 else{ | |
657 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n"; | |
658 } | |
659 | |
660 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n"; | |
661 sleep (3); | |
662 | |
663 my $out_1 = $file_1; | |
664 my $out_2 = $file_2; | |
665 | |
666 if ($out_1 =~ /gz$/){ | |
667 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/; | |
668 } | |
669 else{ | |
670 $out_1 =~ s/trimmed\.fq$/val_1.fq/; | |
671 } | |
672 | |
673 if ($out_2 =~ /gz$/){ | |
674 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/; | |
675 } | |
676 else{ | |
677 $out_2 =~ s/trimmed\.fq$/val_2.fq/; | |
678 } | |
679 | |
680 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n"; | |
681 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n"; | |
682 warn "Writing validated paired-end read 1 reads to $out_1\n"; | |
683 warn "Writing validated paired-end read 2 reads to $out_2\n\n"; | |
684 | |
685 my $unpaired_1; | |
686 my $unpaired_2; | |
687 | |
688 if ($retain){ | |
689 | |
690 $unpaired_1 = $file_1; | |
691 $unpaired_2 = $file_2; | |
692 | |
693 if ($unpaired_1 =~ /gz$/){ | |
694 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/; | |
695 } | |
696 else{ | |
697 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/; | |
698 } | |
699 | |
700 if ($unpaired_2 =~ /gz$/){ | |
701 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/; | |
702 } | |
703 else{ | |
704 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/; | |
705 } | |
706 | |
707 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n"; | |
708 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n"; | |
709 | |
710 warn "Writing unpaired read 1 reads to $unpaired_1\n"; | |
711 warn "Writing unpaired read 2 reads to $unpaired_2\n\n"; | |
712 } | |
713 | |
714 my $sequence_pairs_removed = 0; | |
715 my $read_1_printed = 0; | |
716 my $read_2_printed = 0; | |
717 | |
718 my $count = 0; | |
719 | |
720 while (1){ | |
721 my $id_1 = <IN1>; | |
722 my $seq_1 = <IN1>; | |
723 my $l3_1 = <IN1>; | |
724 my $qual_1 = <IN1>; | |
725 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1); | |
726 | |
727 my $id_2 = <IN2>; | |
728 my $seq_2 = <IN2>; | |
729 my $l3_2 = <IN2>; | |
730 my $qual_2 = <IN2>; | |
731 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2); | |
732 | |
733 ++$count; | |
734 | |
735 | |
736 ## small check if the sequence files appear to FastQ files | |
737 if ($count == 1){ # performed just once | |
738 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){ | |
739 die "Input file doesn't seem to be in FastQ format at sequence $count\n"; | |
740 } | |
741 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){ | |
742 die "Input file doesn't seem to be in FastQ format at sequence $count\n"; | |
743 } | |
744 } | |
745 | |
746 chomp $seq_1; | |
747 chomp $seq_2; | |
748 | |
749 | |
750 ### making sure that the reads do have a sensible length | |
751 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){ | |
752 ++$sequence_pairs_removed; | |
753 if ($retain){ # writing out single-end reads if they are longer than the cutoff | |
754 | |
755 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough | |
756 print UNPAIRED1 $id_1; | |
757 print UNPAIRED1 "$seq_1\n"; | |
758 print UNPAIRED1 $l3_1; | |
759 print UNPAIRED1 $qual_1; | |
760 ++$read_1_printed; | |
761 } | |
762 | |
763 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough | |
764 print UNPAIRED2 $id_2; | |
765 print UNPAIRED2 "$seq_2\n"; | |
766 print UNPAIRED2 $l3_2; | |
767 print UNPAIRED2 $qual_2; | |
768 ++$read_2_printed; | |
769 } | |
770 | |
771 } | |
772 } | |
773 else{ | |
774 print R1 $id_1; | |
775 print R1 "$seq_1\n"; | |
776 print R1 $l3_1; | |
777 print R1 $qual_1; | |
778 | |
779 print R2 $id_2; | |
780 print R2 "$seq_2\n"; | |
781 print R2 $l3_2; | |
782 print R2 $qual_2; | |
783 } | |
784 | |
785 } | |
786 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100); | |
787 warn "Total number of sequences analysed: $count\n\n"; | |
788 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n"; | |
789 | |
790 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n"; | |
791 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n"; | |
792 | |
793 if ($keep){ | |
794 warn "Number of unpaired read 1 reads printed: $read_1_printed\n"; | |
795 warn "Number of unpaired read 2 reads printed: $read_2_printed\n"; | |
796 } | |
797 | |
798 warn "\n"; | |
799 if ($retain){ | |
800 return ($out_1,$out_2,$unpaired_1,$unpaired_2); | |
801 } | |
802 else{ | |
803 return ($out_1,$out_2); | |
804 } | |
805 } | |
806 | |
807 sub process_commandline{ | |
808 my $help; | |
809 my $quality; | |
810 my $adapter; | |
811 my $adapter2; | |
812 my $stringency; | |
813 my $report; | |
814 my $version; | |
815 my $rrbs; | |
816 my $length_cutoff; | |
817 my $keep; | |
818 my $fastqc; | |
819 my $non_directional; | |
820 my $phred33; | |
821 my $phred64; | |
822 my $fastqc_args; | |
823 my $trim; | |
824 my $gzip; | |
825 my $validate; | |
826 my $retain; | |
827 my $length_read_1; | |
828 my $length_read_2; | |
829 my $error_rate; | |
830 my $output_dir; | |
831 my $no_report_file; | |
832 my $suppress_warn; | |
833 | |
834 my $command_line = GetOptions ('help|man' => \$help, | |
835 'q|quality=i' => \$quality, | |
836 'a|adapter=s' => \$adapter, | |
837 'a2|adapter2=s' => \$adapter2, | |
838 'report' => \$report, | |
839 'version' => \$version, | |
840 'stringency=i' => \$stringency, | |
841 'fastqc' => \$fastqc, | |
842 'RRBS' => \$rrbs, | |
843 'keep' => \$keep, | |
844 'length=i' => \$length_cutoff, | |
845 'non_directional' => \$non_directional, | |
846 'phred33' => \$phred33, | |
847 'phred64' => \$phred64, | |
848 'fastqc_args=s' => \$fastqc_args, | |
849 'trim1' => \$trim, | |
850 'gzip' => \$gzip, | |
851 'paired_end' => \$validate, | |
852 'retain_unpaired' => \$retain, | |
853 'length_1|r1=i' => \$length_read_1, | |
854 'length_2|r2=i' => \$length_read_2, | |
855 'e|error_rate=s' => \$error_rate, | |
856 'o|output_dir=s' => \$output_dir, | |
857 'no_report_file' => \$no_report_file, | |
858 'suppress_warn' => \$suppress_warn, | |
859 ); | |
860 | |
861 ### EXIT ON ERROR if there were errors with any of the supplied options | |
862 unless ($command_line){ | |
863 die "Please respecify command line options\n"; | |
864 } | |
865 | |
866 ### HELPFILE | |
867 if ($help){ | |
868 print_helpfile(); | |
869 exit; | |
870 } | |
871 | |
872 | |
873 | |
874 | |
875 if ($version){ | |
876 print << "VERSION"; | |
877 | |
878 Quality-/Adapter-/RRBS-Trimming | |
879 (powered by Cutadapt) | |
880 version $trimmer_version | |
881 | |
882 Last update: 18 10 2012 | |
883 | |
884 VERSION | |
885 exit; | |
886 } | |
887 | |
888 ### RRBS | |
889 unless ($rrbs){ | |
890 $rrbs = 0; | |
891 } | |
892 | |
893 ### SUPRESS WARNINGS | |
894 if (defined $suppress_warn){ | |
895 $DOWARN = 0; | |
896 } | |
897 | |
898 ### QUALITY SCORES | |
899 my $phred_encoding; | |
900 if ($phred33){ | |
901 if ($phred64){ | |
902 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n"; | |
903 } | |
904 $phred_encoding = 33; | |
905 } | |
906 elsif ($phred64){ | |
907 $phred_encoding = 64; | |
908 } | |
909 unless ($phred33 or $phred64){ | |
910 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n"; | |
911 $phred_encoding = 33; | |
912 sleep (3); | |
913 } | |
914 | |
915 ### NON-DIRECTIONAL RRBS | |
916 if ($non_directional){ | |
917 unless ($rrbs){ | |
918 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n"; | |
919 } | |
920 } | |
921 else{ | |
922 $non_directional = 0; | |
923 } | |
924 | |
925 if ($fastqc_args){ | |
926 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed | |
927 } | |
928 else{ | |
929 $fastqc_args = 0; | |
930 } | |
931 | |
932 ### CUSTOM ERROR RATE | |
933 if (defined $error_rate){ | |
934 # make sure that the error rate is between 0 and 1 | |
935 unless ($error_rate >= 0 and $error_rate <= 1){ | |
936 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n"; | |
937 } | |
938 } | |
939 else{ | |
940 $error_rate = 0.1; # (default) | |
941 } | |
942 | |
943 if (defined $adapter){ | |
944 unless ($adapter =~ /^[ACTGNactgn]+$/){ | |
945 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n"; | |
946 } | |
947 $adapter = uc$adapter; | |
948 } | |
949 | |
950 if (defined $adapter2){ | |
951 unless ($validate){ | |
952 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n"; | |
953 } | |
954 unless ($adapter2 =~ /^[ACTGNactgn]+$/){ | |
955 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n"; | |
956 } | |
957 $adapter2 = uc$adapter2; | |
958 } | |
959 | |
960 ### files are supposed to be paired-end files | |
961 if ($validate){ | |
962 | |
963 # making sure that an even number of reads has been supplied | |
964 unless ((scalar@ARGV)%2 == 0){ | |
965 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n"; | |
966 } | |
967 | |
968 ## CUTOFF FOR VALIDATED READ-PAIRS | |
969 | |
970 if (defined $length_read_1 or defined $length_read_2){ | |
971 unless ($retain){ | |
972 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n"; | |
973 } | |
974 | |
975 if (defined $length_read_1){ | |
976 unless ($length_read_1 >= 15 and $length_read_1 <= 100){ | |
977 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; | |
978 } | |
979 unless ($length_read_1 > $length_cutoff){ | |
980 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n"; | |
981 } | |
982 } | |
983 | |
984 if (defined $length_read_2){ | |
985 unless ($length_read_2 >= 15 and $length_read_2 <= 100){ | |
986 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n"; | |
987 } | |
988 unless ($length_read_2 > $length_cutoff){ | |
989 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n"; | |
990 } | |
991 } | |
992 } | |
993 | |
994 if ($retain){ | |
995 $length_read_1 = 35 unless (defined $length_read_1); | |
996 $length_read_2 = 35 unless (defined $length_read_2); | |
997 } | |
998 } | |
999 | |
1000 unless ($no_report_file){ | |
1001 $no_report_file = 0; | |
1002 } | |
1003 | |
1004 ### OUTPUT DIR PATH | |
1005 if ($output_dir){ | |
1006 unless ($output_dir =~ /\/$/){ | |
1007 $output_dir =~ s/$/\//; | |
1008 } | |
1009 } | |
1010 else{ | |
1011 $output_dir = ''; | |
1012 } | |
1013 | |
1014 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file); | |
1015 } | |
1016 | |
1017 | |
1018 | |
1019 | |
1020 sub print_helpfile{ | |
1021 print << "HELP"; | |
1022 | |
1023 USAGE: | |
1024 | |
1025 trim_galore [options] <filename(s)> | |
1026 | |
1027 | |
1028 -h/--help Print this help message and exits. | |
1029 | |
1030 -v/--version Print the version information and exits. | |
1031 | |
1032 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For | |
1033 RRBS samples, quality trimming will be performed first, and adapter | |
1034 trimming is carried in a second round. Other files are quality and adapter | |
1035 trimmed in a single pass. The algorithm is the same as the one used by BWA | |
1036 (Subtract INT from all qualities; compute partial sums from all indices | |
1037 to the end of the sequence; cut sequence at the index at which the sum is | |
1038 minimal). Default Phred score: 20. | |
1039 | |
1040 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores | |
1041 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON. | |
1042 | |
1043 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores | |
1044 (Illumina 1.5 encoding) for quality trimming. | |
1045 | |
1046 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete. | |
1047 | |
1048 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed | |
1049 to FastQC they must be in the form "arg1 arg2 etc.". An example would be: | |
1050 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will | |
1051 automatically invoke FastQC, so --fastqc does not have to be specified | |
1052 separately. | |
1053 | |
1054 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13 | |
1055 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default. | |
1056 | |
1057 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This | |
1058 option requires '--paired' to be specified as well. | |
1059 | |
1060 | |
1061 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a | |
1062 very stringent setting of '1', i.e. even a single bp of overlapping sequence | |
1063 will be trimmed of the 3' end of any read. | |
1064 | |
1065 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching | |
1066 region) (default: 0.1) | |
1067 | |
1068 --gzip Compress the output file with gzip. If the input files are gzip-compressed | |
1069 the output files will be automatically gzip compressed as well. | |
1070 | |
1071 --length <INT> Discard reads that became shorter than length INT because of either | |
1072 quality or adapter trimming. A value of '0' effectively disables | |
1073 this behaviour. Default: 20 bp. | |
1074 | |
1075 For paired-end files, both reads of a read-pair need to be longer than | |
1076 <INT> bp to be printed out to validated paired-end files (see option --paired). | |
1077 If only one read became too short there is the possibility of keeping such | |
1078 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp. | |
1079 | |
1080 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current | |
1081 directory. | |
1082 | |
1083 --no_report_file If specified no report file will be generated. | |
1084 | |
1085 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed. | |
1086 | |
1087 | |
1088 | |
1089 RRBS-specific options (MspI digested material): | |
1090 | |
1091 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition | |
1092 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp | |
1093 removed from their 3' end. This is to avoid that the filled-in C close to the | |
1094 second MspI site in a sequence is used for methylation calls. Sequences which | |
1095 were merely trimmed because of poor quality will not be shortened further. | |
1096 | |
1097 --non_directional Selecting this option for non-directional RRBS libraries will screen | |
1098 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read | |
1099 and, if found, removes the first two basepairs. Like with the option | |
1100 '--rrbs' this avoids using cytosine positions that were filled-in | |
1101 during the end-repair step. '--non_directional' requires '--rrbs' to | |
1102 be specified as well. | |
1103 | |
1104 --keep Keep the quality trimmed intermediate file. Default: off, which means | |
1105 the temporary file is being deleted after adapter trimming. Only has | |
1106 an effect for RRBS samples since other FastQ files are not trimmed | |
1107 for poor qualities separately. | |
1108 | |
1109 | |
1110 Note for RRBS using MseI: | |
1111 | |
1112 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary | |
1113 to specify --rrbs or --non_directional since virtually all reads should start with the sequence | |
1114 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA' | |
1115 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply | |
1116 run Trim Galore! in the standard (i.e. non-RRBS) mode. | |
1117 | |
1118 | |
1119 Paired-end specific options: | |
1120 | |
1121 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for | |
1122 paired-end files. To pass the validation test, both sequences of a sequence pair | |
1123 are required to have a certain minimum length which is governed by the option | |
1124 --length (see above). If only one read passes this length threshold the | |
1125 other read can be rescued (see option --retain_unpaired). Using this option lets | |
1126 you discard too short read pairs without disturbing the sequence-by-sequence order | |
1127 of FastQ files which is required by many aligners. | |
1128 | |
1129 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g. | |
1130 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... . | |
1131 | |
1132 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that | |
1133 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards | |
1134 alignments like this: | |
1135 | |
1136 R1 ---------------------------> or this: -----------------------> R1 | |
1137 R2 <--------------------------- <----------------- R2 | |
1138 | |
1139 as invalid (whenever a start/end coordinate is contained within the other read). | |
1140 | |
1141 --retain_unpaired If only one of the two paired-end reads became too short, the longer | |
1142 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq' | |
1143 output files. The length cutoff for unpaired single-end reads is | |
1144 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF. | |
1145 | |
1146 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to | |
1147 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode. | |
1148 Default: 35 bp. | |
1149 | |
1150 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to | |
1151 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode. | |
1152 Default: 35 bp. | |
1153 | |
1154 | |
1155 Last modified on 18 Oct 2012. | |
1156 | |
1157 HELP | |
1158 exit; | |
1159 } |