comparison bbnorm.xml @ 0:1ef267476a17 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bbtools commit 35db9ac8668f3e376886ea09de63c87dce93e1ce
author iuc
date Tue, 30 May 2023 09:02:11 +0000
parents
children 1baa4ad1ac2f
comparison
equal deleted inserted replaced
-1:000000000000 0:1ef267476a17
1 <tool id="bbtools_bbnorm" name="BBTools: BBNorm" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>Normalise sequencing coverage</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
7 <expand macro="requirements"/>
8 <stdio> <!-- Flags the job as a fatal out-of-memory failure when stderr contains the "This table is (very|crazy|totally) full" warning. -->
9 <regex match="This table is (very|crazy|totally) full, which may reduce accuracy. Ideal load is under" source="stderr" level="fatal_oom" description="Too low memory - generated results might be inaccurate."/>
10 </stdio>
11 <command detect_errors="exit_code"><![CDATA[
12 ## bbtools recommends keeping a proper file extension to determine the input format
13 ## so we hardcode them in the below block.
14 #if str($input_type_cond.input_type) == 'paired':
15 #set read1 = $input_type_cond.reads_collection['forward']
16 #set read2 = $input_type_cond.reads_collection['reverse']
17 #else:
18 #set read1 = $input_type_cond.read1
19 #if str($input_type_cond.input_type) == 'PE_2files':
20 #set read2 = $input_type_cond.read2
21 #end if
22 #end if
23 ## read1/read2 are the selected datasets; read1_file/read2_file are the extension-bearing symlink names passed to bbnorm.sh.
24 #if $read1.ext.endswith('.gz'):
25 #set read1_file = 'forward.fastq.gz'
26 #else
27 #set read1_file = 'forward.fastq'
28 #end if
29 ln -s '${read1}' '${read1_file}' &&
30 ## Two-file and collection inputs also need a link for the reverse reads.
31 #if str($input_type_cond.input_type) in ['PE_2files', 'paired']:
32 #if $read2.ext.endswith('.gz'):
33 #set read2_file = 'reverse.fastq.gz'
34 #else
35 #set read2_file = 'reverse.fastq'
36 #end if
37 ln -s '${read2}' '${read2_file}' &&
38 #end if
39
40
41 ## As the program relies on a count-min-sketch algorithm, potential hash collisions
42 ## are ignored, which translates to a decreased accuracy of the results. Therefore the allocated
43 ## memory amount affects the numerical output: the more RAM, the more accurate the results.
44 ## Check if a memory cap was set; if not, default to GALAXY_MEMORY_MB (or 4096 MB).
45 if [[ "\${_JAVA_OPTIONS}" != *-Xmx* && "\${JAVA_TOOL_OPTIONS}" != *-Xmx* ]]; then
46 export _JAVA_OPTIONS="\${_JAVA_OPTIONS} -Xmx\${GALAXY_MEMORY_MB:-4096}m -Xms256m";
47 fi &&
48 ## Every key=value line below is one argument of the single bbnorm.sh invocation.
49 bbnorm.sh tmpdir="\$TMPDIR" t="\${GALAXY_SLOTS:-2}"
50
51 #### Input parameters
52 #if str($input_type_cond.input_type) == 'single_end':
53 in='${read1_file}'
54 interleaved=f
55 #else:
56 #if str($input_type_cond.input_type) == 'PE_1file':
57 in='${read1_file}'
58 interleaved=t
59 #else:
60 in1='${read1_file}'
61 in2='${read2_file}'
62 interleaved=f
63 #end if
64 #end if
65 ## NOTE(review): outputs are always written as plain .fastq while the declared output format follows the input (possibly fastqsanger.gz) - confirm this is intended.
66 #### Output options
67 out=normalised_R1.fastq
68 #if str($input_type_cond.input_type) in ['PE_2files', 'paired']:
69 out2=normalised_R2.fastq
70 #end if
71 #if $output_options.save_discarded_reads:
72 outt=discarded.fastq
73 #end if
74 touppercase=t
75 #if $output_options.save_kmer_hists:
76 hist=kmer_hist_input.tabular
77 histout=kmer_hist_output.tabular
78 #end if
79 ## prefilter is a conditional nested in the hashing_params section: its select and its members are addressed through hashing_params.prefilter.
80 #### Hashing parameters
81 k=$hashing_params.k
82 bits=$hashing_params.bits
83 hashes=$hashing_params.hashes
84 #if str($hashing_params.prefilter.prefilter) == "true":
85 prefilter=t
86 prehashes=$hashing_params.prefilter.prehashes
87 prefilterbits=$hashing_params.prefilter.prefilterbits
88 prefiltersize=$hashing_params.prefilter.prefiltersize
89 #end if
90 buildpasses=$hashing_params.buildpasses
91 minq=$hashing_params.minq
92 minprob=$hashing_params.minprob
93 rdk=$hashing_params.rdk
94
95 ### Normalization parameters
96 fixspikes=$norm_params.fixspikes
97 target=$target
98 maxdepth=$norm_params.maxdepth
99 mindepth=$norm_params.mindepth
100 minkmers=$norm_params.minkmers
101 percentile=$norm_params.percentile
102 uselowerdepth=$norm_params.uselowerdepth
103 deterministic=$norm_params.deterministic
104 passes=$norm_params.passes
105
106 ### Error detection parameters
107 hdp=$error_det_params.hdp
108 ldp=$error_det_params.ldp
109 tossbadreads=$error_det_params.tossbadreads
110 requirebothbad=$error_det_params.requirebothbad
111 errordetectratio=$error_det_params.errordetectratio
112 highthresh=$error_det_params.highthresh
113 lowthresh=$error_det_params.lowthresh
114 ## ecc is a conditional nested in the error_corr_params section: its members are addressed through error_corr_params.ecc.
115 ### Error correction parameters
116 #if str($error_corr_params.ecc.ecc) == "true":
117 ecc=t
118 ecclimit=$error_corr_params.ecc.ecclimit
119 errorcorrectratio=$error_corr_params.ecc.errorcorrectratio
120 echighthresh=$error_corr_params.ecc.echighthresh
121 eclowthresh=$error_corr_params.ecc.eclowthresh
122 eccmaxqual=$error_corr_params.ecc.eccmaxqual
123 meo=$error_corr_params.ecc.meo
124 mue=$error_corr_params.ecc.mue
125 overlap=$error_corr_params.ecc.overlap
126 #end if
127 ]]></command>
128 <inputs>
129 <conditional name="input_type_cond">
130 <param name="input_type" type="select" label="Choose the category of inputs to be analyzed">
131 <option value="single_end">Single-end reads</option>
132 <option value="PE_1file">Paired-end reads as a single, interleaved dataset</option>
133 <option value="PE_2files" selected="true">Paired-end reads as two separate datasets</option>
134 <option value="paired">Paired-end reads as a collection</option>
135 </param>
136 <when value="single_end">
137 <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Single-end data"/>
138 </when>
139 <when value="PE_1file">
140 <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Interleaved paired-end data"/>
141 </when>
142 <when value="PE_2files">
143 <param name="read1" type="data" format="fastqsanger,fastqsanger.gz" label="Forward reads"/>
144 <param name="read2" type="data" format="fastqsanger,fastqsanger.gz" label="Reverse reads"/>
145 </when>
146 <when value="paired">
147 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of forward and reverse reads"/>
148 </when>
149 </conditional>
150 <param argument="target" type="integer" value="100" min="1" label="Target normalization depth" help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
151 <section name="norm_params" title="Normalization parameters">
152 <param argument="maxdepth" type="integer" value="-1" min="-1" label="Reads will not be downsampled when below this depth, even if they are above the target depth." help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
153 <param argument="mindepth" type="integer" value="5" min="0" label="kmers with depth below this number will not be included when calculating the depth of a read." help="All depth parameters control kmer depth, not read depth. For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))"/>
154 <param argument="minkmers" type="integer" value="15" min="0" label="Reads must have at least this many kmers over min depth to be retained."/>
155 <param argument="percentile" type="integer" value="54" min="1" max="100" label="Percentile to infer read depth" help="Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100."/>
156 <param argument="uselowerdepth" type="boolean" checked="true" label="For pairs, use the depth of the lower read as the depth proxy."/>
157 <param argument="deterministic" type="boolean" checked="true" label="Generate random numbers deterministically" help="This would ensure identical output between multiple runs. May decrease speed with a huge number of threads."/>
158 <param argument="fixspikes" type="boolean" checked="false" label="Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions."/>
159 <param argument="passes" type="integer" value="2" min="1" label="Number of passes to perform" help="1 pass is the basic mode. 2 passes allow greater accuracy, error detection, and better control of output depth."/> <!-- min="1": a run needs at least one pass -->
160 </section>
161 <section name="hashing_params" title="Hashing parameters">
162 <param argument="k" type="integer" value="31" min="1" label="kmer length" help="Values under 32 are most efficient, but arbitrarily high values are supported."/>
163 <param argument="bits" type="select" label="Bits per cell in bloom filter" help="Maximum kmer depth recorded is 2^bits. Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers."> <!-- Allowed cell widths are the fixed choices below; 16 is preselected. -->
164 <option value="2">2</option>
165 <option value="4">4</option>
166 <option value="8">8</option>
167 <option value="16" selected="true">16</option>
168 <option value="32">32</option>
169 </param>
170 <param argument="hashes" type="integer" value="3" min="1" label="Number of times each kmer is hashed and stored." help="Higher is slower. Higher is more accurate if there is enough memory, but less accurate if there is not enough memory."/>
171 <conditional name="prefilter">
172 <param argument="prefilter" type="select" label="Use a prefilter to eliminate low-depth kmers" help="True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells.">
173 <option value="true">Yes</option>
174 <option value="false" selected="true">No</option>
175 </param>
176 <when value="false"/>
177 <when value="true">
178 <param argument="prehashes" type="integer" value="2" min="1" label="Number of hashes for the prefilter"/>
179 <param argument="prefilterbits" type="integer" value="2" min="1" label="Bits per cell in prefilter"/>
180 <param argument="prefiltersize" type="float" value="0.35" min="0" max="1" label="Fraction of memory to allocate for the prefilter."/>
181 </when>
182 </conditional>
183 <param argument="buildpasses" type="integer" value="1" min="1" label="Number of passes" help="More passes can sometimes increase accuracy by iteratively removing low-depth kmers"/>
184 <param argument="minq" type="integer" value="6" min="0" label="Ignore kmers containing bases with quality below this threshold"/>
185 <param argument="minprob" type="float" value="0.5" min="0" max="1" label="Ignore kmers with overall probability of correctness below this threshold"/>
186 <param argument="rdk" type="boolean" checked="true" label="Remove duplicate kmers" help="When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once."/>
187 </section>
188 <section name="error_det_params" title="Error detection parameters">
189 <param argument="hdp" type="integer" value="90" min="0" max="100" label="highdepthpercentile" help="Position in sorted kmer depth array used as proxy of a read's high kmer depth."/>
190 <param argument="ldp" type="integer" value="25" min="0" max="100" label="lowdepthpercentile" help="Position in sorted kmer depth array used as proxy of a read's low kmer depth."/>
191 <param argument="tossbadreads" type="boolean" checked="false" label="Throw away reads detected as containing errors."/>
192 <param argument="requirebothbad" type="boolean" checked="false" label="Only toss bad pairs if both reads are bad."/>
193 <param argument="errordetectratio" type="integer" value="125" min="0" label="Error detection ratio" help="Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads."/>
194 <param argument="highthresh" type="integer" value="12" min="0" label="Threshold for high kmer" help="A high kmer at this or above are considered non-error."/>
195 <param argument="lowthresh" type="integer" value="3" min="0" label="Threshold for low kmer" help="Kmers at this and below are always considered errors."/>
196 </section>
197
198 <section name="error_corr_params" title="Error correction parameters">
199 <conditional name="ecc">
200 <param argument="ecc" type="select" label="What should be done with detected errors?" help="Tadpole is now preferred for error correction, as it does a better job.">
201 <option value="true" >Correct errors when possible</option>
202 <option value="false" selected="true">Do not attempt to correct errors</option>
203 </param>
204 <when value="false"/>
205 <when value="true">
206 <param argument="ecclimit" type="integer" value="3" min="1" label="Correct up to this many errors per read." help="If more are detected, the read will remain unchanged."/>
207 <param argument="errorcorrectratio" type="integer" value="140" min="0" label="Depth ratio" help="Adjacent kmers with a depth ratio of at least this much between will be classified as an error."/>
208 <param argument="echighthresh" type="integer" value="22" min="0" label="Threshold for high kmer" help="A kmer at this or above may be considered non-error."/>
209 <param argument="eclowthresh" type="integer" value="2" min="0" label="Threshold for low kmer." help="kmers at this depth or below will be considered as errors."/>
210 <param argument="eccmaxqual" type="integer" value="127" min="0" label="Do not correct bases with quality above this value."/>
211 <param argument="meo" type="boolean" checked="false" label="Marks errors by reducing quality value of suspected errors; does not correct anything."/>
212 <param argument="mue" type="boolean" checked="true" label="Mark errors only on uncorrectable reads."/>
213 <param argument="overlap" type="boolean" checked="false" label="Correct errors by read overlap."/>
214 </when>
215 </conditional>
216 </section>
217
218 <section name="output_options" title="Output options">
219 <param name="save_discarded_reads" type="boolean" checked="false" label="Save the reads that were eliminated from the input datasets to the history"/>
220 <param name="save_kmer_hists" type="boolean" checked="false" label="Save the kmer histograms (in tabular format) for the input and output datasets to the history"/>
221 </section>
222 </inputs>
223 <outputs> <!-- Each output is gated by a filter on the selected input type or on an output_options switch. -->
224 <data name="output_normalised_R1" format_source="read1" from_work_dir="normalised_R1.fastq" label="${tool.name} on ${on_string} (normalised R1 reads)"> <!-- Created for every input type except the paired collection. -->
225 <filter>input_type_cond['input_type'] != 'paired'</filter>
226 </data>
227 <data name="output_normalised_R2" format_source="read2" from_work_dir="normalised_R2.fastq" label="${tool.name} on ${on_string} (normalised R2 reads)"> <!-- Created only for two separate paired-end datasets. -->
228 <filter>input_type_cond['input_type'] == 'PE_2files'</filter>
229 </data>
230 <collection name="output_pair" type="paired" format_source="reads_collection" label="${tool.name} on ${on_string} (normalised reads)"> <!-- Created only for a paired collection input; collects the same two work-dir files as the plain outputs. -->
231 <filter>input_type_cond['input_type'] == 'paired'</filter>
232 <data name="forward" from_work_dir="normalised_R1.fastq" label="${tool.name} on ${on_string} (normalised R1 reads)"/>
233 <data name="reverse" from_work_dir="normalised_R2.fastq" label="${tool.name} on ${on_string} (normalised R2 reads)"/>
234 </collection>
235 <data name="output_discarded" format="fastqsanger" from_work_dir="discarded.fastq" label="${tool.name} on ${on_string} (discarded reads)"> <!-- Optional: only when save_discarded_reads is checked. -->
236 <filter>output_options['save_discarded_reads'] is True</filter>
237 </data>
238 <data name="kmer_hist_input" format="tabular" from_work_dir="kmer_hist_input.tabular" label="${tool.name} on ${on_string} (kmer histogram input)"> <!-- Optional: both histograms are gated by save_kmer_hists. -->
239 <filter>output_options['save_kmer_hists'] is True</filter>
240 </data>
241 <data name="kmer_hist_output" format="tabular" from_work_dir="kmer_hist_output.tabular" label="${tool.name} on ${on_string} (kmer histogram output)">
242 <filter>output_options['save_kmer_hists'] is True</filter>
243 </data>
244 </outputs>
245 <tests>
246 <!-- Single end sequencing -->
247 <test expect_num_outputs="1">
248 <param name="input_type" value="single_end"/>
249 <param name="read1" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
250 <param name="target" value="4"/>
251 <section name="norm_params">
252 <param name="deterministic" value="true"/>
253 <param name="mindepth" value="0"/>
254 </section>
255 <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
256 </test>
257 <!-- Single end sequencing, compressed -->
258 <test expect_num_outputs="1">
259 <param name="input_type" value="single_end"/>
260 <param name="read1" ftype="fastqsanger.gz" value="bbnorm/input_R1.fastq.gz"/>
261 <param name="target" value="4"/>
262 <section name="norm_params">
263 <param name="deterministic" value="true"/>
264 <param name="mindepth" value="0"/>
265 </section>
266 <output name="output_normalised_R1" ftype="fastqsanger.gz" value="bbnorm/normalised_R1.fastq"/>
267 </test>
268 <!-- PE as an interleaved file -->
269 <test expect_num_outputs="4">
270 <param name="input_type" value="PE_1file"/>
271 <param name="read1" ftype="fastqsanger" value="bbnorm/input_interleaved.fastq"/>
272 <param name="target" value="4"/>
273 <section name="norm_params">
274 <param name="deterministic" value="true"/>
275 <param name="mindepth" value="0"/>
276 </section>
277 <section name="output_options">
278 <param name="save_discarded_reads" value="true"/>
279 <param name="save_kmer_hists" value="true"/>
280 </section>
281 <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_interleaved.fastq"/>
282 <output name="output_discarded" ftype="fastqsanger" value="bbnorm/discarded.fastq"/>
283 <output name="kmer_hist_input" ftype="tabular" file="bbnorm/kmer_hist_input.tabular"/>
284 <output name="kmer_hist_output" ftype="tabular" file="bbnorm/kmer_hist_output.tabular"/>
285 </test>
286 <!-- PE as 2 files -->
287 <test expect_num_outputs="2">
288 <param name="input_type" value="PE_2files"/>
289 <param name="read1" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
290 <param name="read2" ftype="fastqsanger" value="bbnorm/input_R2.fastq"/>
291 <param name="target" value="4"/>
292 <section name="norm_params">
293 <param name="deterministic" value="true"/>
294 <param name="mindepth" value="0"/>
295 </section>
296 <output name="output_normalised_R1" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
297 <output name="output_normalised_R2" ftype="fastqsanger" value="bbnorm/normalised_R2.fastq"/>
298 </test>
299 <!-- Paired end collection -->
300 <test expect_num_outputs="3">
301 <param name="input_type" value="paired"/>
302 <param name="reads_collection">
303 <collection type="paired">
304 <element name="forward" ftype="fastqsanger" value="bbnorm/input_R1.fastq"/>
305 <element name="reverse" ftype="fastqsanger" value="bbnorm/input_R2.fastq"/>
306 </collection>
307 </param>
308 <param name="target" value="4"/>
309 <section name="norm_params">
310 <param name="deterministic" value="true"/>
311 <param name="mindepth" value="0"/>
312 </section>
313 <output_collection name="output_pair" type="paired" count="2">
314 <element name="forward" ftype="fastqsanger" value="bbnorm/normalised_R1.fastq"/>
315 <element name="reverse" ftype="fastqsanger" value="bbnorm/normalised_R2.fastq"/>
316 </output_collection>
317 </test>
318 </tests>
319 <help>
320 **What it does**
321
322 BBNorm downsamples a provided sequencing output, while paying attention to potential heterogeneities in sequencing depth obtained from the wet-lab workflow. The reads corresponding to regions with low coverage will be kept as is, whereas some of the reads contributing to an above-threshold coverage depth will be subsampled. The resulting data set is expected to be smaller in size, whereas the genome regions with low coverage levels will still be represented in the subsampled dataset. This provides a more uniform coverage depth against all genomic coordinates while the computational resources needed for subsequent steps such as assembly can be substantially reduced without losing coverage anywhere.
323
324 -----
325
326 **If the target sequencing depth is 2X, a Martian genome sequencing result is expected to be down-sampled as follows:**
327
328 input.fastq::
329
330 @read_header_1
331 AAAAATTTTTCCCCCGGGGGAAATTT
332 +
333 FFFFFFFFFFFFFFFEFFFFFF,FFE
334 @read_header_2
335 TTTTTCCCCCGGGGGAAATTTCCCGGG
336 +
337 FFFFFFFFFFFFFFFEFFFFFFEFFDD
338 @read_header_3
339 TTTTTCCCCCGGGGGAAATTTCCCGGG
340 +
341 FFFFFFFFFFFCEFFEFFFFFFEFFEE
342 @read_header_4
343 TTTTTCCCCCGGGGGAAATTTCCCGGG
344 +
345 FFFFFDDFFFFFFFFEFFFFFFEFFEF
346 @read_header_5
347 TTTTTCCCCCGGGGGAAATTTCCCGGG
348 +
349 FFFFFEFFFFEEFFFEFFFFFFDFFFF
350 @read_header_6
351 AAAAATTTTTCCCCCGGGGGAAATTT
352 +
353 FFFFFFFFFFFFFFFEFFFFFFEFFD
354
355
356 output.fastq::
357
358 @read_header_1
359 AAAAATTTTTCCCCCGGGGGAAATTT
360 +
361 FFFFFFFFFFFFFFFEFFFFFF,FFE
362 @read_header_2
363 TTTTTCCCCCGGGGGAAATTTCCCGGG
364 +
365 FFFFFFFFFFFFFFFEFFFFFFEFFDD
366 @read_header_3
367 TTTTTCCCCCGGGGGAAATTTCCCGGG
368 +
369 FFFFFFFFFFFCEFFEFFFFFFEFFEE
370 @read_header_6
371 AAAAATTTTTCCCCCGGGGGAAATTT
372 +
373 FFFFFFFFFFFFFFFEFFFFFFEFFD
374
375
376 **Indications**
377
378 BBNorm is mainly intended for use in short-read assembly pipelines. It is useful when there is so much data that computation time becomes excessive, or when the coverage distribution is highly skewed. As opposed to keeping a randomly selected subset of reads, such as retaining the first n-many elements, this is a weighted resampling that tries to reduce coverage around coordinates of very high sequencing depth.
379
380
381 **Contraindications**
382
383 * The data already has a roughly uniform coverage that does not need to be normalised further.
384 * You do not have any excess data to discard: BBNorm does not increase data quantity by imputation or by repeatedly sampling with replacement.
385 * Your pipeline reports results that rely on quantification of abundance (e.g. differential expression profiling or ChIP-Seq).
386 * You want to do variant discovery. Reduction of sequencing depth might bias significance levels, or even obscure the existence of rare variants altogether.
387 * The sequencing platform has a very high error rate (e.g. ONT) that might mislead this algorithm.
388
389 </help>
390 <expand macro="citations"/>
391 </tool>
392