comparison FLASH.xml @ 2:6889442b27dc draft default tip

Uploaded
author aaronpetkau
date Sat, 04 Jul 2015 08:58:21 -0400
parents
children
comparison
equal deleted inserted replaced
1:a444685f161c 2:6889442b27dc
1 <tool id="FLASH" name="FLASH" version="1.3.0">
2 <description>merge paired-end reads from fragments that are shorter than twice the length of reads</description>
<!--
  Cheetah command template that assembles the FLASH.sh invocation.

  Argument order:
    1. six output dataset paths, passed positionally to FLASH.sh (that script is
       not part of this file, so how it consumes them cannot be verified here);
    2. the fixed output prefix "out" and the worker-thread count;
    3. overlap limits, output layout flag and the advanced options;
    4. the two input FASTQ files: either two separate datasets or the
       forward/reverse members of a paired collection.

  Parameter names used below are defined in the inputs section of this tool.
-->
<command interpreter="bash"><![CDATA[
FLASH.sh
    '$extendedFrags' '$notCombined1' '$notCombined2' '$interNotCombined' '$readsAndPairs' '$log_file'
    -o out
    ## Honour the number of slots Galaxy allocated to the job; fall back to the
    ## previously hard-coded value of 4 when GALAXY_SLOTS is not set.
    -t \${GALAXY_SLOTS:-4}

    #if $min_overlap
        -m $min_overlap
    #end if

    ## Maximum overlap. An explicit value always wins. When the field was left
    ## empty, 250 is used as the wrapper default, except when the user supplied a
    ## read length, fragment length or fragment standard deviation: the help text
    ## states that an explicit max-overlap overrides the value derived from those
    ## options, so forcing -M 250 would silently discard them.
    #set $derive_max_overlap = $options.options_select == "advanced" and ($options.read_length or $options.fragment_length or $options.fragment_stdev)
    #if $max_overlap
        -M $max_overlap
    #elif not $derive_max_overlap
        -M 250
    #end if

    ## Output layout; "Non-interleaved_fastq" needs no extra flag.
    #if $outputs.output_type == "Interleaved_fastq"
        --interleaved-output
    #elif $outputs.output_type == "tab"
        -To
    #end if

    #if $options.options_select == "advanced"
        ## 0 is a legitimate density (no mismatches allowed), so test for
        ## "unset" instead of truthiness, which would drop a value of 0.
        #if str($options.max_mismatch_density) not in ("", "None")
            -x $options.max_mismatch_density
        #end if
        #if $options.phred_offset
            -p $options.phred_offset
        #end if
        #if $options.read_length
            -r $options.read_length
        #end if
        #if $options.fragment_length
            -f $options.fragment_length
        #end if
        #if $options.fragment_stdev
            -s $options.fragment_stdev
        #end if
        ## The two booleans expand to their truevalue (the flag itself).
        #if $options.cap_mismatch_quals
            $options.cap_mismatch_quals
        #end if
        #if $options.quiet
            $options.quiet
        #end if
    #end if

    ## Input reads come last.
    #if $input_type.sPaired == "paired"
        '$input_type.pInput1' '$input_type.pInput2'
    #elif $input_type.sPaired == "collections"
        '$input_type.fastq_collection.forward' '$input_type.fastq_collection.reverse'
    #end if
]]></command>
<!--
  Tool form definition. Parameter and conditional names are referenced by the
  command template and by the output filters, so they must not be renamed.
-->
<inputs>
  <!-- Input layout: two separate FASTQ datasets, or one paired collection. -->
  <conditional name="input_type">
    <param name="sPaired" type="select" label="Single Pair or Collection">
      <option value="collections">Paired-end Collections</option>
      <option value="paired">Paired-end</option>
    </param>
    <when value="paired">
      <param name="pInput1" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Forward FASTQ file" help="Must have ASCII encoded quality scores"/>
      <param name="pInput2" type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Reverse FASTQ file" help="File format must match the Forward FASTQ file"/>
    </when>
    <when value="collections">
      <!-- Restricted to the same FASTQ datatypes as the separate-file branch
           (previously any "txt" dataset was accepted). -->
      <param name="fastq_collection" type="data_collection" collection_type="paired" format="fastq,fastqsanger,fastqillumina,fastqsolexa" label="Paired-end Fastq collection" help="" optional="false"/>
    </when>
  </conditional>

  <!-- Overlap limits. The command template falls back to a maximum overlap of
       250 when max_overlap is left empty. -->
  <param name="min_overlap" type="integer" label="Minimum overlap" optional="true"/>
  <param name="max_overlap" type="integer" label="Maximum overlap" value="250" optional="true"/>

  <!-- Output layout. The selected value drives both the command-line flag and
       the filters in the outputs section. No branch has extra parameters, so
       every "when" is empty; they are declared explicitly so each select option
       has a matching case. -->
  <conditional name="outputs">
    <param name="output_type" type="select" label="Output type">
      <option value="Non-interleaved_fastq">Non-interleaved fastq</option>
      <option value="Interleaved_fastq">Interleaved fastq</option>
      <option value="tab">Tab-delimited</option>
    </param>
    <when value="Non-interleaved_fastq"/>
    <when value="Interleaved_fastq"/>
    <when value="tab"/>
  </conditional>

  <!-- Advanced options are only shown (and only used by the command template)
       when "Advanced" is selected. -->
  <conditional name="options">
    <param name="options_select" type="select" label="Options Type">
      <option value="basic">Basic</option>
      <option value="advanced">Advanced</option>
    </param>
    <when value="basic"/>
    <when value="advanced">
      <param name="max_mismatch_density" type="float" label="Maximum mismatch density" optional="true"/>
      <param name="phred_offset" type="select" label="Phred-offset" optional="true">
        <option value="33">33</option>
        <option value="64">64</option>
      </param>
      <param name="read_length" type="integer" label="Average read length" optional="true"/>
      <param name="fragment_length" type="integer" label="Fragment length" optional="true"/>
      <param name="fragment_stdev" type="integer" label="Fragment length standard deviation" optional="true"/>
      <!-- Boolean flags expand to the command-line switch when checked and to
           an empty string when unchecked. -->
      <param name="cap_mismatch_quals" type="boolean" label="Cap mismatch quality scores" truevalue="--cap-mismatch-quals" falsevalue="" optional="true"/>
      <!--
        Compression options are intentionally not exposed
        (Phil says the compression options aren't needed):
        <param name="compress" type="boolean" label="Compress output files with gzip" optional="true"/>
        <param name="compress_prog" type="text" label="Compression program" optional="true"/>
        <param name="compress_prog_args" type="text" label="Compression program arguments" optional="true"/>
      -->
      <param name="quiet" type="boolean" label="Do not print informational messages" truevalue="-q" falsevalue="" optional="true"/>
    </when>
  </conditional>
</inputs>
<!--
  Output datasets. Which of them are created depends on the "output_type"
  select inside the "outputs" conditional:
    Non-interleaved_fastq : extendedFrags, notCombined1, notCombined2
    Interleaved_fastq     : extendedFrags, interNotCombined
    tab                   : readsAndPairs
  log_file carries no filter and is therefore always produced.
-->
<outputs>
  <data name="extendedFrags" label="Merged reads" format="fastqsanger">
    <filter>outputs['output_type'] != "tab"</filter>
  </data>
  <data name="notCombined1" label="Read 1 of mate pairs not merged" format="fastqsanger">
    <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
  </data>
  <data name="notCombined2" label="Read 2 of mate pairs not merged" format="fastqsanger">
    <filter>outputs['output_type'] == "Non-interleaved_fastq"</filter>
  </data>
  <data name="interNotCombined" label="Interleaved non-combined pairs" format="fastqsanger">
    <filter>outputs['output_type'] == "Interleaved_fastq"</filter>
  </data>
  <data name="readsAndPairs" label="Merged and non-merged pairs" format="tabular">
    <filter>outputs['output_type'] == "tab"</filter>
  </data>
  <data name="log_file" label="Log file" format="txt"/>
  <!--
    Histogram outputs are currently disabled:
    <data format="txt" name="numericHistogram" label="Numeric histogram of merged read lengths"/>
    <data format="txt" name="visualHistogram" label="Visual histogram of merged read lengths"/>
  -->
</outputs>
<!--
  External dependency: the "FLASH" package, version 1.2.9.
  NOTE(review): the tool wrapper itself declares version 1.3.0; confirm that
  pinning the 1.2.9 package here is intentional.
-->
<requirements>
  <requirement version="1.2.9" type="package">FLASH</requirement>
</requirements>
118 <help>
119 ----------------------------------------------------------------------------
120 DESCRIPTION
121 ----------------------------------------------------------------------------
122
123 FLASH (Fast Length Adjustment of SHort reads) is an accurate and fast tool
124 to merge paired-end reads that were generated from DNA fragments whose
125 lengths are shorter than twice the length of reads. Merged read pairs result
126 in unpaired longer reads, which are generally more desired in genome
127 assembly and genome analysis processes.
128
129 Briefly, the FLASH algorithm considers all possible overlaps at or above a
130 minimum length between the reads in a pair and chooses the overlap that
131 results in the lowest mismatch density (proportion of mismatched bases in
132 the overlapped region). Ties between multiple overlaps are broken by
133 considering quality scores at mismatch sites. When building the merged
134 sequence, FLASH computes a consensus sequence in the overlapped region.
135 More details can be found in the original publication
136 (http://bioinformatics.oxfordjournals.org/content/27/21/2957.full).
137
138 Limitations of FLASH include:
139 - FLASH cannot merge paired-end reads that do not overlap.
140 - FLASH cannot merge read pairs that have an outward orientation, either
141 due to being "jumping" reads or due to excessive trimming.
142 - FLASH is not designed for data that has a significant amount of indel
143 errors (such as Sanger sequencing data). It is best suited for Illumina
144 data.
145
146 ----------------------------------------------------------------------------
147 MANDATORY INPUT
148 ----------------------------------------------------------------------------
149
150 The most common input to FLASH is two FASTQ files containing read 1 and read 2
151 of each mate pair, respectively, in the same order.
152
153 Alternatively, you may provide one FASTQ file, which may be standard input,
154 containing paired-end reads in either interleaved FASTQ (see the
155 --interleaved-input option) or tab-delimited (see the --tab-delimited-input
156 option) format. In all cases, gzip compressed input is autodetected. Also,
157 in all cases, the PHRED offset is, by default, assumed to be 33; use the
158 --phred-offset option to change it.
159
160 ----------------------------------------------------------------------------
161 OUTPUT
162 ----------------------------------------------------------------------------
163
164 The default output of FLASH consists of the following files:
165
166 - out.extendedFrags.fastq The merged reads.
167 - out.notCombined_1.fastq Read 1 of mate pairs that were not merged.
168 - out.notCombined_2.fastq Read 2 of mate pairs that were not merged.
169 - out.hist Numeric histogram of merged read lengths.
170 - out.histogram Visual histogram of merged read lengths.
171
172 FLASH also logs informational messages to standard output. These can be
173 redirected to a file, as in the following example:
174
175 $ flash reads_1.fq reads_2.fq | tee flash.log
176
177 In addition, FLASH supports several features affecting the output:
178
179 - Writing the merged reads directly to standard output (--to-stdout)
180 - Writing gzip compressed output files (-z) or using an external
181 compression program (--compress-prog)
182 - Writing the uncombined read pairs in interleaved FASTQ format
183 (--interleaved-output)
184 - Writing all output reads to a single file in tab-delimited format
185 (--tab-delimited-output)
186
187 ----------------------------------------------------------------------------
188 OPTIONS
189 ----------------------------------------------------------------------------
190
191 -m, --min-overlap=NUM The minimum required overlap length between two
192 reads to provide a confident overlap. Default:
193 10bp.
194
195 -M, --max-overlap=NUM Maximum overlap length expected in approximately
196 90% of read pairs. It is by default set to 65bp,
197 which works well for 100bp reads generated from a
198 180bp library, assuming a normal distribution of
199 fragment lengths. Overlaps longer than the maximum
200 overlap parameter are still considered as good
201 overlaps, but the mismatch density (explained below)
202 is calculated over the first max_overlap bases in
203 the overlapped region rather than the entire
204 overlap. Default: 65bp, or calculated from the
205 specified read length, fragment length, and fragment
206 length standard deviation.
207
208 -x, --max-mismatch-density=NUM
209 Maximum allowed ratio between the number of
210 mismatched base pairs and the overlap length.
211 Two reads will not be combined with a given overlap
212 if that overlap results in a mismatched base density
213 higher than this value. Note: Any occurrence of an
214 'N' in either read is ignored and not counted
215 towards the mismatches or overlap length. Our
216 experimental results suggest that higher values of
217 the maximum mismatch density yield larger
218 numbers of correctly merged read pairs but at
219 the expense of higher numbers of incorrectly
220 merged read pairs. Default: 0.25.
221
222 -p, --phred-offset=OFFSET
223 The smallest ASCII value of the characters used to
224 represent quality values of bases in FASTQ files.
225 It should be set to either 33, which corresponds
226 to the later Illumina platforms and Sanger
227 platforms, or 64, which corresponds to the
228 earlier Illumina platforms. Default: 33.
229
230 -r, --read-len=LEN
231
232 -f, --fragment-len=LEN
233
234 -s, --fragment-len-stddev=LEN
235 Average read length, fragment length, and fragment
236 standard deviation. These are convenience parameters
237 only, as they are only used for calculating the
238 maximum overlap (--max-overlap) parameter.
239 The maximum overlap is calculated as the overlap of
240 average-length reads from an average-size fragment
241 plus 2.5 times the fragment length standard
242 deviation. The default values are -r 100, -f 180,
243 and -s 18, so this works out to a maximum overlap of
244 65 bp. If --max-overlap is specified, then the
245 specified value overrides the calculated value.
246
247 If you do not know the standard deviation of the
248 fragment library, you can probably assume that the
249 standard deviation is 10% of the average fragment
250 length.
251
252 --cap-mismatch-quals Cap quality scores assigned at mismatch locations
253 to 2. This was the default behavior in FLASH v1.2.7
254 and earlier. Later versions will instead calculate
255 such scores as the
256 absolute value of the difference in quality scores,
257 but at least 2. Essentially, the new behavior
258 prevents a low quality base call that is likely a
259 sequencing error from significantly bringing down
260 the quality of a high quality, likely correct base
261 call.
262
263 --interleaved-input Instead of requiring files MATES_1.FASTQ and
264 MATES_2.FASTQ, allow a single file MATES.FASTQ that
265 has the paired-end reads interleaved. Specify "-"
266 to read from standard input.
267
268 --interleaved-output Write the uncombined pairs in interleaved FASTQ
269 format.
270
271 -I, --interleaved Equivalent to specifying both --interleaved-input
272 and --interleaved-output.
273
274 -Ti, --tab-delimited-input
275 Assume the input is in tab-delimited format
276 rather than FASTQ, in the format described below in
277 '--tab-delimited-output'. In this mode you should
278 provide a single input file, each line of which must
279 contain either a read pair (5 fields) or a single
280 read (3 fields). FLASH will try to combine the read
281 pairs. Single reads will be written to the output
282 file as-is if also using --tab-delimited-output;
283 otherwise they will be ignored. Note that you may
284 specify "-" as the input file to read the
285 tab-delimited data from standard input.
286
287 -To, --tab-delimited-output
288 Write output in tab-delimited format (not FASTQ).
289 Each line will contain either a combined pair in the
290 format 'tag &lt;tab&gt; seq &lt;tab&gt; qual' or an uncombined
291 pair in the format 'tag &lt;tab&gt; seq_1 &lt;tab&gt; qual_1
292 &lt;tab&gt; seq_2 &lt;tab&gt; qual_2'.
293
294 -o, --output-prefix=PREFIX
295 Prefix of output files. Default: "out".
296
297 -d, --output-directory=DIR
298 Path to directory for output files. Default:
299 current working directory.
300
301 -c, --to-stdout
302 Write the combined reads to standard output. In
303 this mode, with FASTQ output (the default) the
304 uncombined reads are discarded. With tab-delimited
305 output, uncombined reads are included in the
306 tab-delimited data written to standard output.
307 In both cases, histogram files are not written,
308 and informational messages are sent to standard
309 error rather than to standard output.
310
311 --suffix=SUFFIX, --output-suffix=SUFFIX
312 Use SUFFIX as the suffix of the output files
313 after ".fastq". A dot before the suffix is assumed,
314 unless an empty suffix is provided. Default:
315 nothing; or 'gz' if -z is specified; or PROG if
316 --compress-prog=PROG is specified.
317
318 -t, --threads=NTHREADS Set the number of worker threads. This is in
319 addition to the I/O threads. Default: number of
320 processors. Note: if you need FLASH's output to
321 appear deterministically or in the same order as
322 the original reads, you must specify -t 1
323 (--threads=1).
324
325 -q, --quiet Do not print informational messages.
326
327 -h, --help Display this help and exit.
328
329 -v, --version Display version.
330 </help>
331 </tool>