annotate trim_galore @ 0:3c1664caa8e3 draft

Uploaded
author bgruening
date Sat, 06 Jul 2013 09:52:23 -0400
parents
children 898db63d2e84
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/perl
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
2 use strict;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
3 use warnings;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
4 use Getopt::Long;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
5 use IPC::Open3;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
6 use File::Spec;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
7 use File::Basename;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
8 use Cwd;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
9
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
10 ## This program is Copyright (C) 2012, Felix Krueger (felix.krueger@babraham.ac.uk)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
11
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
12 ## This program is free software: you can redistribute it and/or modify
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
13 ## it under the terms of the GNU General Public License as published by
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
14 ## the Free Software Foundation, either version 3 of the License, or
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
15 ## (at your option) any later version.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
16
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
17 ## This program is distributed in the hope that it will be useful,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
18 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
19 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
20 ## GNU General Public License for more details.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
21
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
22 ## You should have received a copy of the GNU General Public License
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
23 ## along with this program. If not, see <http://www.gnu.org/licenses/>.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
24
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
25
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
26
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
27 ## this script is taking in FastQ sequences and trims them with Cutadapt
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
28 ## last modified on 18 10 2012
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
29
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
30 ########################################################################
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
31
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
32 # change these paths if needed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
33
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
34 my $path_to_cutadapt = 'cutadapt';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
35 my $path_to_fastqc = 'fastqc';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
36
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
37 ########################################################################
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
38
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
39
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
40 my $trimmer_version = '0.2.5';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
41 my $DOWARN = 1; # print on screen warning and text by default
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
42 BEGIN { $SIG{'__WARN__'} = sub { warn $_[0] if $DOWARN } };
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
43
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
44 my ($cutoff,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$a2,$error_rate,$output_dir,$no_report_file) = process_commandline();
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
45
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
46 ### SETTING DEFAULTS UNLESS THEY WERE SPECIFIED
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
47 unless (defined $cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
48 $cutoff = 20;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
49 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
50 my $phred_score_cutoff = $cutoff; # only relevant for report
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
51
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
52 unless (defined $adapter){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
53 $adapter = 'AGATCGGAAGAGC';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
54 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
55 unless (defined $a2){ # optional adapter for the second read in a pair. Only works for --paired trimming
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
56 $a2 = '';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
57 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
58
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
59 unless (defined $stringency){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
60 $stringency = 1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
61 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
62
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
63 unless (defined $length_cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
64 $length_cutoff = 20;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
65 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
66
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
67 if ($phred_encoding == 64){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
68 $cutoff += 31;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
69 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
70
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
71 my @filenames = @ARGV;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
72
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
73 my $file_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
74 my $file_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
75
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
76 foreach my $filename (@ARGV){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
77 trim ($filename);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
78 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
79
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
80
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
81 sub trim{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
82 my $filename = shift;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
83
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
84 my $output_filename = (split (/\//,$filename))[-1];
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
85 # warn "Here is the outputfile name: $output_filename\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
86
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
87 my $report = $output_filename;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
88 $report =~ s/$/_trimming_report.txt/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
89
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
90 if ($no_report_file) {
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
91 $report = File::Spec->devnull;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
92 open (REPORT,'>',$report) or die "Failed to write to file: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
93 # warn "Redirecting report output to /dev/null\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
94 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
95 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
96 open (REPORT,'>',$output_dir.$report) or die "Failed to write to file: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
97 warn "Writing report to '$output_dir$report'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
98 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
99
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
100 warn "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
101 print REPORT "\nSUMMARISING RUN PARAMETERS\n==========================\nInput filename: $filename\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
102
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
103 warn "Quality Phred score cutoff: $phred_score_cutoff\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
104 print REPORT "Quality Phred score cutoff: $phred_score_cutoff\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
105
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
106 warn "Quality encoding type selected: ASCII+$phred_encoding\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
107 print REPORT "Quality encoding type selected: ASCII+$phred_encoding\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
108
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
109 warn "Adapter sequence: '$adapter'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
110 print REPORT "Adapter sequence: '$adapter'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
111
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
112 if ($error_rate == 0.1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
113 warn "Maximum trimming error rate: $error_rate (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
114 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
115 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
116 warn "Maximum trimming error rate: $error_rate\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
117 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
118
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
119 print REPORT "Maximum trimming error rate: $error_rate";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
120 if ($error_rate == 0.1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
121 print REPORT " (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
122 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
123 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
124 print REPORT "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
125 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
126
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
127 if ($a2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
128 warn "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
129 print REPORT "Optional adapter 2 sequence (only used for read 2 of paired-end files): '$a2'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
130 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
131
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
132 warn "Minimum required adapter overlap (stringency): $stringency bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
133 print REPORT "Minimum required adapter overlap (stringency): $stringency bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
134
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
135 if ($validate){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
136 warn "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
137 print REPORT "Minimum required sequence length for both reads before a sequence pair gets removed: $length_cutoff bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
138 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
139 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
140 warn "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
141 print REPORT "Minimum required sequence length before a sequence gets removed: $length_cutoff bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
142 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
143
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
144 if ($validate){ # only for paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
145
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
146 if ($retain){ # keeping single-end reads if only one end is long enough
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
147
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
148 if ($length_read_1 == 35){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
149 warn "Length cut-off for read 1: $length_read_1 bp (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
150 print REPORT "Length cut-off for read 1: $length_read_1 bp (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
151 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
152 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
153 warn "Length cut-off for read 1: $length_read_1 bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
154 print REPORT "Length cut-off for read 1: $length_read_1 bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
155 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
156
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
157 if ($length_read_2 == 35){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
158 warn "Length cut-off for read 2: $length_read_2 b (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
159 print REPORT "Length cut-off for read 2: $length_read_2 bp (default)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
160 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
161 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
162 warn "Length cut-off for read 2: $length_read_2 bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
163 print REPORT "Length cut-off for read 2: $length_read_2 bp\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
164 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
165 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
166 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
167
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
168 if ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
169 warn "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
170 print REPORT "File was specified to be an MspI-digested RRBS sample. Sequences with adapter contamination will be trimmed a further 2 bp to remove potential methylation-biased bases from the end-repair reaction\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
171 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
172
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
173 if ($non_directional){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
174 warn "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
175 print REPORT "File was specified to be a non-directional MspI-digested RRBS sample. Sequences starting with either 'CAA' or 'CGA' will have the first 2 bp trimmed off to remove potential methylation-biased bases from the end-repair reaction\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
176 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
177
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
178 if ($trim){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
179 warn "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
180 print REPORT "All sequences will be trimmed by 1 bp on their 3' end to avoid problems with invalid paired-end alignments with Bowtie 1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
181 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
182
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
183 if ($fastqc){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
184 warn "Running FastQC on the data once trimming has completed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
185 print REPORT "Running FastQC on the data once trimming has completed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
186
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
187 if ($fastqc_args){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
188 warn "Running FastQC with the following extra arguments: '$fastqc_args'\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
189 print REPORT "Running FastQC with the following extra arguments: $fastqc_args\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
190 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
191 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
192
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
193 if ($keep and $rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
194 warn "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
195 print REPORT "Keeping quality trimmed (but not yet adapter trimmed) intermediate FastQ file\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
196 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
197
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
198 if ($gzip or $filename =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
199 warn "Output file will be GZIP compressed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
200 print REPORT "Output file will be GZIP compressed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
201 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
202
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
203 warn "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
204 print REPORT "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
205 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
206
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
207 my $temp;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
208
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
209 ### Proceeding differently for RRBS and other type of libraries
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
210 if ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
211
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
212 ### Skipping quality filtering for RRBS libraries if a quality cutoff of 0 was specified
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
213 if ($cutoff == 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
214 warn "Quality cutoff selected was 0 - Skipping quality trimming altogether\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
215 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
216 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
217 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
218
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
219 $temp = $filename;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
220 $temp =~ s/$/_qual_trimmed.fastq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
221 open (TEMP,'>',$output_dir.$temp) or die "Can't write to $temp: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
222
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
223 warn " >>> Now performing adaptive quality trimming with a Phred-score cutoff of: $cutoff <<<\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
224 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
225
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
226 open (QUAL,"$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -a X $filename |") or die "Can't open pipe to Cutadapt: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
227
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
228 my $qual_count = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
229
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
230 while (1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
231 my $l1 = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
232 my $seq = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
233 my $l3 = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
234 my $qual = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
235 last unless (defined $qual);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
236
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
237 $qual_count++;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
238 if ($qual_count%10000000 == 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
239 warn "$qual_count sequences processed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
240 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
241 print TEMP "$l1$seq$l3$qual";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
242 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
243
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
244 warn "\n >>> Quality trimming completed <<<\n$qual_count sequences processed in total\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
245 close QUAL or die "Unable to close filehandle: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
246 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
247
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
248 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
249 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
250
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
251
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
252 if ($output_filename =~ /\.fastq$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
253 $output_filename =~ s/\.fastq$/_trimmed.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
254 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
255 elsif ($output_filename =~ /\.fastq\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
256 $output_filename =~ s/\.fastq\.gz$/_trimmed.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
257 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
258 elsif ($output_filename =~ /\.fq$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
259 $output_filename =~ s/\.fq$/_trimmed.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
260 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
261 elsif ($output_filename =~ /\.fq\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
262 $output_filename =~ s/\.fq\.gz$/_trimmed.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
263 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
264 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
265 $output_filename =~ s/$/_trimmed.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
266 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
267
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
268 warn "Writing final adapter and quality trimmed output to $output_filename\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
269 open (OUT,'>',$output_dir.$output_filename) or die "Can't open $output_filename: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
270 sleep (2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
271
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
272 my $count = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
273 my $too_short = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
274 my $quality_trimmed = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
275 my $rrbs_trimmed = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
276 my $rrbs_trimmed_start = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
277 my $CAA = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
278 my $CGA = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
279
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
280 if ($rrbs and $cutoff != 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
281
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
282 ### optionally using 2 different adapters for read 1 and read 2
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
283 if ($validate and $a2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
284 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
285 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
286 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
287 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
288 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
289 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
290 else{ # this is read 2 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
291 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$a2' from file $temp <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
292 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
293 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $a2 $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
294 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
295 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
296 ### Using the same adapter for both read 1 and read 2
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
297 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
298 warn "\n >>> Now performing adapter trimming for the adapter sequence: '$adapter' from file $temp <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
299 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
300 open3 (\*WRITER, \*TRIM, \*ERROR,"$path_to_cutadapt -f fastq -e $error_rate -O $stringency -a $adapter $output_dir$temp") or die "Failed to launch Cutadapt: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
301 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
302
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
303 close WRITER or die $!; # not needed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
304
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
305 open (QUAL,"$output_dir$temp") or die $!; # quality trimmed file
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
306
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
307 if ($filename =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
308 open (IN,"zcat $filename |") or die $!; # original, untrimmed file
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
309 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
310 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
311 open (IN,$filename) or die $!; # original, untrimmed file
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
312 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
313
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
314 while (1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
315
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
316 # we can process the output from Cutadapt and the original input 1 by 1 to decide if the adapter has been removed or not
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
317 my $l1 = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
318 my $seq = <TRIM>; # adapter trimmed sequence
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
319 my $l3 = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
320 my $qual = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
321
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
322 $_ = <IN>; # irrelevant
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
323 my $original_seq = <IN>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
324 $_ = <IN>; # irrelevant
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
325 $_ = <IN>; # irrelevant
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
326
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
327 $_ = <QUAL>; # irrelevant
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
328 my $qual_trimmed_seq = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
329 $_ = <QUAL>; # irrelevant
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
330 my $qual_trimmed_qual = <QUAL>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
331
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
332 last unless (defined $qual and defined $qual_trimmed_qual); # could be empty strings
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
333
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
334 $count++;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
335 if ($count%10000000 == 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
336 warn "$count sequences processed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
337 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
338
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
339 chomp $seq;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
340 chomp $qual;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
341 chomp $qual_trimmed_seq;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
342 chomp $original_seq;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
343
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
344 my $quality_trimmed_seq_length = length $qual_trimmed_seq;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
345
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
346 if (length $original_seq > length $qual_trimmed_seq){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
347 ++$quality_trimmed;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
348 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
349
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
350 my $nd = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
351
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
352 ### NON-DIRECTIONAL RRBS
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
353 if ($non_directional){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
354 if (length$seq > 2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
355 if ($seq =~ /^CAA/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
356 ++$CAA;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
357 $seq = substr ($seq,2,length($seq)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
358 $qual = substr ($qual,2,length($qual)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
359 ++$rrbs_trimmed_start;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
360 $nd = 1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
361 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
362 elsif ($seq =~ /^CGA/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
363 $seq = substr ($seq,2,length($seq)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
364 $qual = substr ($qual,2,length($qual)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
365 ++$CGA;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
366 ++$rrbs_trimmed_start;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
367 $nd = 1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
368 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
369 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
370 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
371
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
372 ### directional read
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
373 unless ($nd == 1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
374 if (length $seq >= 2 and length$seq < $quality_trimmed_seq_length){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
375 $seq = substr ($seq,0,length($seq)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
376 $qual = substr ($qual,0,length($qual)-2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
377 ++$rrbs_trimmed;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
378 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
379 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
380
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
381 ### Shortening all sequences by 1 bp on the 3' end
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
382 if ($trim){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
383 $seq = substr($seq,0,length($seq)-1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
384 $qual = substr($qual,0,length($qual)-1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
385 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
386
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
387 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
388 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
389 print OUT "$l1$seq\n$l3$qual\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
390 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
391 else{ # single end
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
392 if (length $seq < $length_cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
393 ++$too_short;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
394 next;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
395 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
396 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
397 print OUT "$l1$seq\n$l3$qual\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
398 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
399 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
400 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
401
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
402 print REPORT "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
403 while (<ERROR>){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
404 warn $_;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
405 print REPORT $_;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
406 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
407
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
408 close IN or die "Unable to close IN filehandle: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
409 close QUAL or die "Unable to close QUAL filehandle: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
410 close TRIM or die "Unable to close TRIM filehandle: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
411 close OUT or die "Unable to close OUT filehandle: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
412 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
413 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
414
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
415 ### optionally using 2 different adapters for read 1 and read 2
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
416 if ($validate and $a2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
417 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
418 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
419 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
420 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
421 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
422 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
423 else{ # this is read 2 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
424 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$a2' from file $filename <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
425 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
426 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $a2 $filename") or die "Failed to launch Cutadapt: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
427 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
428 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
429 ### Using the same adapter for both read 1 and read 2
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
430 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
431 warn "\n >>> Now performing quality (cutoff $cutoff) and adapter trimming in a single pass for the adapter sequence: '$adapter' from file $filename <<< \n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
432 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
433 open3 (\*WRITER, \*TRIM, \*ERROR, "$path_to_cutadapt -f fastq -e $error_rate -q $cutoff -O $stringency -a $adapter $filename") or die "Failed to launch Cutadapt: $!";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
434 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
435
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
436 close WRITER or die $!; # not needed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
437
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
438 while (1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
439
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
440 my $l1 = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
441 my $seq = <TRIM>; # quality and/or adapter trimmed sequence
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
442 my $l3 = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
443 my $qual = <TRIM>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
444 # print "$l1$seq\n$l3$qual\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
445 last unless (defined $qual); # could be an empty string
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
446
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
447 $count++;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
448 if ($count%10000000 == 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
449 warn "$count sequences processed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
450 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
451
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
452 chomp $seq;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
453 chomp $qual;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
454
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
455 ### Shortening all sequences by 1 bp on the 3' end
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
456 if ($trim){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
457 $seq = substr($seq,0,length($seq)-1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
458 $qual = substr($qual,0,length($qual)-1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
459 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
460
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
461 ### PRINTING (POTENTIALLY TRIMMED) SEQUENCE
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
462 if ($validate){ # printing the sequence without performing a length check (this is performed for the read pair separately later)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
463 print OUT "$l1$seq\n$l3$qual\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
464 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
465 else{ # single end
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
466 if (length $seq < $length_cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
467 ++$too_short;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
468 next;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
469 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
470 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
471 print OUT "$l1$seq\n$l3$qual\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
472 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
473 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
474 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
475
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
476 print REPORT "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
477 while (<ERROR>){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
478 warn $_;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
479 print REPORT $_;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
480 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
481
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
482 close TRIM or die "Unable to close TRIM filehandle: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
483 close ERROR or die "Unable to close ERROR filehandle: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
484 close OUT or die "Unable to close OUT filehandle: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
485
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
486 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
487
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
488 if ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
489 unless ($keep){ # keeping the quality trimmed intermediate file for RRBS files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
490
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
491 # deleting temporary quality trimmed file
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
492 my $deleted = unlink "$output_dir$temp";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
493
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
494 if ($deleted){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
495 warn "Successfully deleted temporary file $temp\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
496 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
497 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
498 warn "Could not delete temporary file $temp";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
499 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
500 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
501 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
502
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
503 warn "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
504 print REPORT "\nRUN STATISTICS FOR INPUT FILE: $filename\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
505
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
506 warn "="x 45,"\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
507 print REPORT "="x 45,"\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
508
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
509 warn "$count sequences processed in total\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
510 print REPORT "$count sequences processed in total\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
511
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
512 ### only reporting this separately if quality and adapter trimming were performed separately
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
513 if ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
514 my $percentage_shortened = sprintf ("%.1f",$quality_trimmed/$count*100);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
515 warn "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
516 print REPORT "Sequences were truncated to a varying degree because of deteriorating qualities (Phred score quality cutoff: $cutoff):\t$quality_trimmed ($percentage_shortened%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
517 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
518
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
519 my $percentage_too_short = sprintf ("%.1f",$too_short/$count*100);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
520 warn "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
521 print REPORT "Sequences removed because they became shorter than the length cutoff of $length_cutoff bp:\t$too_short ($percentage_too_short%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
522
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
523 if ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
524 my $percentage_rrbs_trimmed = sprintf ("%.1f",$rrbs_trimmed/$count*100);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
525 warn "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
526 print REPORT "RRBS reads trimmed by additional 2 bp when adapter contamination was detected:\t$rrbs_trimmed ($percentage_rrbs_trimmed%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
527 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
528
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
529 if ($non_directional){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
530 my $percentage_rrbs_trimmed_at_start = sprintf ("%.1f",$rrbs_trimmed_start/$count*100);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
531 warn "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
532 print REPORT "RRBS reads trimmed by 2 bp at the start when read started with CAA ($CAA) or CGA ($CGA) in total:\t$rrbs_trimmed_start ($percentage_rrbs_trimmed_at_start%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
533 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
534
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
535 warn "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
536 print REPORT "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
537
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
538 ### RUNNING FASTQC
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
539 if ($fastqc){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
540 warn "\n >>> Now running FastQC on the data <<<\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
541 sleep (5);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
542 if ($fastqc_args){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
543 system ("$path_to_fastqc $fastqc_args $output_filename");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
544 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
545 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
546 system ("$path_to_fastqc $output_filename");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
547 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
548 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
549
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
550 ### COMPRESSING OUTPUT FILE
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
551 unless ($validate){ # not gzipping intermediate files for paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
552 if ($gzip or $filename =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
553 warn "\n >>> GZIP-ing the output file <<<\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
554 system ("gzip -f $output_filename");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
555 $output_filename = $output_filename.'.gz';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
556 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
557 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
558
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
559 ### VALIDATE PAIRED-END FILES
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
560 if ($validate){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
561
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
562 ### Figure out whether current file counts as read 1 or read 2 of paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
563
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
564 if ( scalar(@filenames)%2 == 0){ # this is read 1 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
565 $file_1 = $output_filename;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
566 shift @filenames;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
567 # warn "This is read 1: $file_1\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
568 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
569 else{ # this is read 2 of a pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
570 $file_2 = $output_filename;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
571 shift @filenames;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
572 # warn "This is read 2: $file_2\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
573 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
574
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
575 if ($file_1 and $file_2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
576 warn "Validate paired-end files $file_1 and $file_2\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
577 sleep (1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
578
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
579 my ($val_1,$val_2,$un_1,$un_2) = validate_paired_end_files($file_1,$file_2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
580
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
581 ### RUNNING FASTQC
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
582 if ($fastqc){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
583
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
584 warn "\n >>> Now running FastQC on the validated data $val_1<<<\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
585 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
586
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
587 if ($fastqc_args){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
588 system ("$path_to_fastqc $fastqc_args $val_1");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
589 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
590 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
591 system ("$path_to_fastqc $val_1");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
592 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
593
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
594 warn "\n >>> Now running FastQC on the validated data $val_2<<<\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
595 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
596
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
597 if ($fastqc_args){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
598 system ("$path_to_fastqc $fastqc_args $val_2");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
599 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
600 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
601 system ("$path_to_fastqc $val_2");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
602 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
603
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
604 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
605
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
606 if ($gzip or $filename =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
607
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
608 # compressing main fastQ output files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
609 warn "Compressing the validated output file $val_1 ...\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
610 system ("gzip -f $val_1");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
611
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
612 warn "Compressing the validated output file $val_2 ...\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
613 system ("gzip -f $val_2");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
614
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
615
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
616 if ($retain){ # compressing unpaired reads
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
617 warn "Compressing the unpaired read output $un_1 ...\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
618 system ("gzip -f $un_1");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
619
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
620 warn "Compressing the unpaired read output $un_2 ...\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
621 system ("gzip -f $un_2");
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
622 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
623 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
624
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
625 warn "Deleting both intermediate output files $file_1 and $file_2\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
626 unlink "$output_dir$file_1";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
627 unlink "$output_dir$file_2";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
628
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
629 warn "\n",'='x100,"\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
630 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
631
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
632 $file_1 = undef; # setting file_1 and file_2 to undef once validation is completed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
633 $file_2 = undef;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
634 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
635 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
636
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
637 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
638
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
639 sub validate_paired_end_files{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
640
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
641 my $file_1 = shift;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
642 my $file_2 = shift;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
643
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
644 warn "file_1 $file_1 file_2 $file_2\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
645
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
646 if ($file_1 =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
647 open (IN1,"zcat $output_dir$file_1 |") or die "Couldn't read from file $file_1: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
648 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
649 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
650 open (IN1, "$output_dir$file_1") or die "Couldn't read from file $file_1: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
651 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
652
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
653 if ($file_2 =~ /\.gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
654 open (IN2,"zcat $output_dir$file_2 |") or die "Couldn't read from file $file_2: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
655 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
656 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
657 open (IN2, "$output_dir$file_2") or die "Couldn't read from file $file_2: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
658 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
659
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
660 warn "\n>>>>> Now validing the length of the 2 paired-end infiles: $file_1 and $file_2 <<<<<\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
661 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
662
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
663 my $out_1 = $file_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
664 my $out_2 = $file_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
665
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
666 if ($out_1 =~ /gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
667 $out_1 =~ s/trimmed\.fq\.gz$/val_1.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
668 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
669 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
670 $out_1 =~ s/trimmed\.fq$/val_1.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
671 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
672
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
673 if ($out_2 =~ /gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
674 $out_2 =~ s/trimmed\.fq\.gz$/val_2.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
675 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
676 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
677 $out_2 =~ s/trimmed\.fq$/val_2.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
678 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
679
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
680 open (R1,'>',$output_dir.$out_1) or die "Couldn't write to $out_1 $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
681 open (R2,'>',$output_dir.$out_2) or die "Couldn't write to $out_2 $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
682 warn "Writing validated paired-end read 1 reads to $out_1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
683 warn "Writing validated paired-end read 2 reads to $out_2\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
684
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
685 my $unpaired_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
686 my $unpaired_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
687
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
688 if ($retain){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
689
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
690 $unpaired_1 = $file_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
691 $unpaired_2 = $file_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
692
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
693 if ($unpaired_1 =~ /gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
694 $unpaired_1 =~ s/trimmed\.fq\.gz$/unpaired_1.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
695 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
696 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
697 $unpaired_1 =~ s/trimmed\.fq$/unpaired_1.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
698 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
699
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
700 if ($unpaired_2 =~ /gz$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
701 $unpaired_2 =~ s/trimmed\.fq\.gz$/unpaired_2.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
702 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
703 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
704 $unpaired_2 =~ s/trimmed\.fq$/unpaired_2.fq/;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
705 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
706
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
707 open (UNPAIRED1,'>',$output_dir.$unpaired_1) or die "Couldn't write to $unpaired_1: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
708 open (UNPAIRED2,'>',$output_dir.$unpaired_2) or die "Couldn't write to $unpaired_2: $!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
709
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
710 warn "Writing unpaired read 1 reads to $unpaired_1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
711 warn "Writing unpaired read 2 reads to $unpaired_2\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
712 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
713
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
714 my $sequence_pairs_removed = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
715 my $read_1_printed = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
716 my $read_2_printed = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
717
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
718 my $count = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
719
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
720 while (1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
721 my $id_1 = <IN1>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
722 my $seq_1 = <IN1>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
723 my $l3_1 = <IN1>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
724 my $qual_1 = <IN1>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
725 last unless ($id_1 and $seq_1 and $l3_1 and $qual_1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
726
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
727 my $id_2 = <IN2>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
728 my $seq_2 = <IN2>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
729 my $l3_2 = <IN2>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
730 my $qual_2 = <IN2>;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
731 last unless ($id_2 and $seq_2 and $l3_2 and $qual_2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
732
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
733 ++$count;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
734
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
735
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
736 ## small check if the sequence files appear to FastQ files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
737 if ($count == 1){ # performed just once
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
738 if ($id_1 !~ /^\@/ or $l3_1 !~ /^\+/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
739 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
740 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
741 if ($id_2 !~ /^\@/ or $l3_2 !~ /^\+/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
742 die "Input file doesn't seem to be in FastQ format at sequence $count\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
743 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
744 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
745
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
746 chomp $seq_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
747 chomp $seq_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
748
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
749
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
750 ### making sure that the reads do have a sensible length
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
751 if ( (length($seq_1) < $length_cutoff) or (length($seq_2) < $length_cutoff) ){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
752 ++$sequence_pairs_removed;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
753 if ($retain){ # writing out single-end reads if they are longer than the cutoff
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
754
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
755 if ( length($seq_1) >= $length_read_1){ # read 1 is long enough
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
756 print UNPAIRED1 $id_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
757 print UNPAIRED1 "$seq_1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
758 print UNPAIRED1 $l3_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
759 print UNPAIRED1 $qual_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
760 ++$read_1_printed;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
761 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
762
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
763 if ( length($seq_2) >= $length_read_2){ # read 2 is long enough
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
764 print UNPAIRED2 $id_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
765 print UNPAIRED2 "$seq_2\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
766 print UNPAIRED2 $l3_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
767 print UNPAIRED2 $qual_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
768 ++$read_2_printed;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
769 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
770
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
771 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
772 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
773 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
774 print R1 $id_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
775 print R1 "$seq_1\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
776 print R1 $l3_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
777 print R1 $qual_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
778
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
779 print R2 $id_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
780 print R2 "$seq_2\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
781 print R2 $l3_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
782 print R2 $qual_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
783 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
784
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
785 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
786 my $percentage = sprintf("%.2f",$sequence_pairs_removed/$count*100);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
787 warn "Total number of sequences analysed: $count\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
788 warn "Number of sequence pairs removed: $sequence_pairs_removed ($percentage%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
789
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
790 print REPORT "Total number of sequences analysed for the sequence pair length validation: $count\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
791 print REPORT "Number of sequence pairs removed because at least one read was shorter than the length cutoff ($length_cutoff bp): $sequence_pairs_removed ($percentage%)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
792
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
793 if ($keep){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
794 warn "Number of unpaired read 1 reads printed: $read_1_printed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
795 warn "Number of unpaired read 2 reads printed: $read_2_printed\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
796 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
797
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
798 warn "\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
799 if ($retain){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
800 return ($out_1,$out_2,$unpaired_1,$unpaired_2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
801 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
802 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
803 return ($out_1,$out_2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
804 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
805 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
806
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
807 sub process_commandline{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
808 my $help;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
809 my $quality;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
810 my $adapter;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
811 my $adapter2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
812 my $stringency;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
813 my $report;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
814 my $version;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
815 my $rrbs;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
816 my $length_cutoff;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
817 my $keep;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
818 my $fastqc;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
819 my $non_directional;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
820 my $phred33;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
821 my $phred64;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
822 my $fastqc_args;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
823 my $trim;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
824 my $gzip;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
825 my $validate;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
826 my $retain;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
827 my $length_read_1;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
828 my $length_read_2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
829 my $error_rate;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
830 my $output_dir;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
831 my $no_report_file;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
832 my $suppress_warn;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
833
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
834 my $command_line = GetOptions ('help|man' => \$help,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
835 'q|quality=i' => \$quality,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
836 'a|adapter=s' => \$adapter,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
837 'a2|adapter2=s' => \$adapter2,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
838 'report' => \$report,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
839 'version' => \$version,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
840 'stringency=i' => \$stringency,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
841 'fastqc' => \$fastqc,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
842 'RRBS' => \$rrbs,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
843 'keep' => \$keep,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
844 'length=i' => \$length_cutoff,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
845 'non_directional' => \$non_directional,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
846 'phred33' => \$phred33,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
847 'phred64' => \$phred64,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
848 'fastqc_args=s' => \$fastqc_args,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
849 'trim1' => \$trim,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
850 'gzip' => \$gzip,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
851 'paired_end' => \$validate,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
852 'retain_unpaired' => \$retain,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
853 'length_1|r1=i' => \$length_read_1,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
854 'length_2|r2=i' => \$length_read_2,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
855 'e|error_rate=s' => \$error_rate,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
856 'o|output_dir=s' => \$output_dir,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
857 'no_report_file' => \$no_report_file,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
858 'suppress_warn' => \$suppress_warn,
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
859 );
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
860
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
861 ### EXIT ON ERROR if there were errors with any of the supplied options
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
862 unless ($command_line){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
863 die "Please respecify command line options\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
864 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
865
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
866 ### HELPFILE
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
867 if ($help){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
868 print_helpfile();
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
869 exit;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
870 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
871
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
872
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
873
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
874
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
875 if ($version){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
876 print << "VERSION";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
877
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
878 Quality-/Adapter-/RRBS-Trimming
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
879 (powered by Cutadapt)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
880 version $trimmer_version
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
881
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
882 Last update: 18 10 2012
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
883
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
884 VERSION
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
885 exit;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
886 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
887
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
888 ### RRBS
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
889 unless ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
890 $rrbs = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
891 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
892
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
893 ### SUPRESS WARNINGS
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
894 if (defined $suppress_warn){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
895 $DOWARN = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
896 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
897
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
898 ### QUALITY SCORES
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
899 my $phred_encoding;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
900 if ($phred33){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
901 if ($phred64){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
902 die "Please specify only a single quality encoding type (--phred33 or --phred64)\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
903 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
904 $phred_encoding = 33;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
905 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
906 elsif ($phred64){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
907 $phred_encoding = 64;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
908 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
909 unless ($phred33 or $phred64){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
910 warn "No quality encoding type selected. Assuming that the data provided uses Sanger encoded Phred scores (default)\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
911 $phred_encoding = 33;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
912 sleep (3);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
913 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
914
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
915 ### NON-DIRECTIONAL RRBS
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
916 if ($non_directional){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
917 unless ($rrbs){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
918 die "Option '--non_directional' requires '--rrbs' to be specified as well. Please re-specify!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
919 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
920 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
921 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
922 $non_directional = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
923 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
924
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
925 if ($fastqc_args){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
926 $fastqc = 1; # specifying fastqc extra arguments automatically means that FastQC will be executed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
927 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
928 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
929 $fastqc_args = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
930 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
931
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
932 ### CUSTOM ERROR RATE
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
933 if (defined $error_rate){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
934 # make sure that the error rate is between 0 and 1
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
935 unless ($error_rate >= 0 and $error_rate <= 1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
936 die "Please specify an error rate between 0 and 1 (the default is 0.1)\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
937 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
938 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
939 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
940 $error_rate = 0.1; # (default)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
941 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
942
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
943 if (defined $adapter){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
944 unless ($adapter =~ /^[ACTGNactgn]+$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
945 die "Adapter sequence must contain DNA characters only (A,C,T,G or N)!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
946 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
947 $adapter = uc$adapter;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
948 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
949
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
950 if (defined $adapter2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
951 unless ($validate){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
952 die "An optional adapter for read 2 of paired-end files requires '--paired' to be specified as well! Please re-specify\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
953 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
954 unless ($adapter2 =~ /^[ACTGNactgn]+$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
955 die "Optional adapter 2 sequence must contain DNA characters only (A,C,T,G or N)!\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
956 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
957 $adapter2 = uc$adapter2;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
958 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
959
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
960 ### files are supposed to be paired-end files
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
961 if ($validate){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
962
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
963 # making sure that an even number of reads has been supplied
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
964 unless ((scalar@ARGV)%2 == 0){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
965 die "Please provide an even number of input files for paired-end FastQ trimming! Aborting ...\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
966 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
967
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
968 ## CUTOFF FOR VALIDATED READ-PAIRS
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
969
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
970 if (defined $length_read_1 or defined $length_read_2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
971 unless ($retain){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
972 die "Please specify --keep_unpaired to alter the unpaired single-end read length cut off(s)\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
973 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
974
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
975 if (defined $length_read_1){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
976 unless ($length_read_1 >= 15 and $length_read_1 <= 100){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
977 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
978 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
979 unless ($length_read_1 > $length_cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
980 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
981 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
982 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
983
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
984 if (defined $length_read_2){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
985 unless ($length_read_2 >= 15 and $length_read_2 <= 100){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
986 die "Please select a sensible cutoff for when a read pair should be filtered out due to short length (allowed range: 15-100 bp)\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
987 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
988 unless ($length_read_2 > $length_cutoff){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
989 die "The single-end unpaired read length needs to be longer than the paired-end cut-off value\n\n";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
990 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
991 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
992 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
993
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
994 if ($retain){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
995 $length_read_1 = 35 unless (defined $length_read_1);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
996 $length_read_2 = 35 unless (defined $length_read_2);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
997 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
998 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
999
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1000 unless ($no_report_file){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1001 $no_report_file = 0;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1002 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1003
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1004 ### OUTPUT DIR PATH
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1005 if ($output_dir){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1006 unless ($output_dir =~ /\/$/){
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1007 $output_dir =~ s/$/\//;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1008 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1009 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1010 else{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1011 $output_dir = '';
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1012 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1013
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1014 return ($quality,$adapter,$stringency,$rrbs,$length_cutoff,$keep,$fastqc,$non_directional,$phred_encoding,$fastqc_args,$trim,$gzip,$validate,$retain,$length_read_1,$length_read_2,$adapter2,$error_rate,$output_dir,$no_report_file);
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1015 }
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1016
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1017
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1018
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1019
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1020 sub print_helpfile{
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1021 print << "HELP";
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1022
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1023 USAGE:
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1024
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1025 trim_galore [options] <filename(s)>
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1026
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1027
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1028 -h/--help Print this help message and exits.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1029
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1030 -v/--version Print the version information and exits.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1031
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1032 -q/--quality <INT> Trim low-quality ends from reads in addition to adapter removal. For
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1033 RRBS samples, quality trimming will be performed first, and adapter
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1034 trimming is carried in a second round. Other files are quality and adapter
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1035 trimmed in a single pass. The algorithm is the same as the one used by BWA
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1036 (Subtract INT from all qualities; compute partial sums from all indices
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1037 to the end of the sequence; cut sequence at the index at which the sum is
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1038 minimal). Default Phred score: 20.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1039
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1040 --phred33 Instructs Cutadapt to use ASCII+33 quality scores as Phred scores
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1041 (Sanger/Illumina 1.9+ encoding) for quality trimming. Default: ON.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1042
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1043 --phred64 Instructs Cutadapt to use ASCII+64 quality scores as Phred scores
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1044 (Illumina 1.5 encoding) for quality trimming.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1045
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1046 --fastqc Run FastQC in the default mode on the FastQ file once trimming is complete.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1047
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1048 --fastqc_args "<ARGS>" Passes extra arguments to FastQC. If more than one argument is to be passed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1049 to FastQC they must be in the form "arg1 arg2 etc.". An example would be:
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1050 --fastqc_args "--nogroup --outdir /home/". Passing extra arguments will
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1051 automatically invoke FastQC, so --fastqc does not have to be specified
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1052 separately.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1053
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1054 -a/--adapter <STRING> Adapter sequence to be trimmed. If not specified explicitely, the first 13
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1055 bp of the Illumina adapter 'AGATCGGAAGAGC' will be used by default.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1056
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1057 -a2/--adapter2 <STRING> Optional adapter sequence to be trimmed off read 2 of paired-end files. This
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1058 option requires '--paired' to be specified as well.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1059
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1060
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1061 -s/--stringency <INT> Overlap with adapter sequence required to trim a sequence. Defaults to a
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1062 very stringent setting of '1', i.e. even a single bp of overlapping sequence
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1063 will be trimmed of the 3' end of any read.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1064
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1065 -e <ERROR RATE> Maximum allowed error rate (no. of errors divided by the length of the matching
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1066 region) (default: 0.1)
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1067
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1068 --gzip Compress the output file with gzip. If the input files are gzip-compressed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1069 the output files will be automatically gzip compressed as well.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1070
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1071 --length <INT> Discard reads that became shorter than length INT because of either
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1072 quality or adapter trimming. A value of '0' effectively disables
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1073 this behaviour. Default: 20 bp.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1074
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1075 For paired-end files, both reads of a read-pair need to be longer than
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1076 <INT> bp to be printed out to validated paired-end files (see option --paired).
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1077 If only one read became too short there is the possibility of keeping such
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1078 unpaired single-end reads (see --retain_unpaired). Default pair-cutoff: 20 bp.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1079
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1080 -o/--output_dir <DIR> If specified all output will be written to this directory instead of the current
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1081 directory.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1082
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1083 --no_report_file If specified no report file will be generated.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1084
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1085 --suppress_warn If specified any output to STDOUT or STDERR will be suppressed.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1086
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1087
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1088
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1089 RRBS-specific options (MspI digested material):
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1090
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1091 --rrbs Specifies that the input file was an MspI digested RRBS sample (recognition
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1092 site: CCGG). Sequences which were adapter-trimmed will have a further 2 bp
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1093 removed from their 3' end. This is to avoid that the filled-in C close to the
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1094 second MspI site in a sequence is used for methylation calls. Sequences which
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1095 were merely trimmed because of poor quality will not be shortened further.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1096
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1097 --non_directional Selecting this option for non-directional RRBS libraries will screen
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1098 quality-trimmed sequences for 'CAA' or 'CGA' at the start of the read
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1099 and, if found, removes the first two basepairs. Like with the option
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1100 '--rrbs' this avoids using cytosine positions that were filled-in
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1101 during the end-repair step. '--non_directional' requires '--rrbs' to
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1102 be specified as well.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1103
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1104 --keep Keep the quality trimmed intermediate file. Default: off, which means
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1105 the temporary file is being deleted after adapter trimming. Only has
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1106 an effect for RRBS samples since other FastQ files are not trimmed
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1107 for poor qualities separately.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1108
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1109
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1110 Note for RRBS using MseI:
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1111
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1112 If your DNA material was digested with MseI (recognition motif: TTAA) instead of MspI it is NOT necessary
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1113 to specify --rrbs or --non_directional since virtually all reads should start with the sequence
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1114 'TAA', and this holds true for both directional and non-directional libraries. As the end-repair of 'TAA'
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1115 restricted sites does not involve any cytosines it does not need to be treated especially. Instead, simply
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1116 run Trim Galore! in the standard (i.e. non-RRBS) mode.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1117
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1118
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1119 Paired-end specific options:
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1120
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1121 --paired This option performs length trimming of quality/adapter/RRBS trimmed reads for
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1122 paired-end files. To pass the validation test, both sequences of a sequence pair
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1123 are required to have a certain minimum length which is governed by the option
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1124 --length (see above). If only one read passes this length threshold the
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1125 other read can be rescued (see option --retain_unpaired). Using this option lets
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1126 you discard too short read pairs without disturbing the sequence-by-sequence order
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1127 of FastQ files which is required by many aligners.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1128
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1129 Trim Galore! expects paired-end files to be supplied in a pairwise fashion, e.g.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1130 file1_1.fq file1_2.fq SRR2_1.fq.gz SRR2_2.fq.gz ... .
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1131
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1132 -t/--trim1 Trims 1 bp off every read from its 3' end. This may be needed for FastQ files that
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1133 are to be aligned as paired-end data with Bowtie. This is because Bowtie (1) regards
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1134 alignments like this:
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1135
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1136 R1 ---------------------------> or this: -----------------------> R1
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1137 R2 <--------------------------- <----------------- R2
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1138
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1139 as invalid (whenever a start/end coordinate is contained within the other read).
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1140
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1141 --retain_unpaired If only one of the two paired-end reads became too short, the longer
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1142 read will be written to either '.unpaired_1.fq' or '.unpaired_2.fq'
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1143 output files. The length cutoff for unpaired single-end reads is
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1144 governed by the parameters -r1/--length_1 and -r2/--length_2. Default: OFF.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1145
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1146 -r1/--length_1 <INT> Unpaired single-end read length cutoff needed for read 1 to be written to
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1147 '.unpaired_1.fq' output file. These reads may be mapped in single-end mode.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1148 Default: 35 bp.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1149
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1150 -r2/--length_2 <INT> Unpaired single-end read length cutoff needed for read 2 to be written to
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1151 '.unpaired_2.fq' output file. These reads may be mapped in single-end mode.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1152 Default: 35 bp.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1153
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1154
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1155 Last modified on 18 Oct 2012.
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1156
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1157 HELP
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1158 exit;
3c1664caa8e3 Uploaded
bgruening
parents:
diff changeset
1159 }