comparison necat.xml @ 0:6ee7eb5821f0 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/necat commit 6946d81de9419c90e9bc4ea2f7bd5e4168dd6dd6
author iuc
date Fri, 25 Nov 2022 14:24:27 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6ee7eb5821f0
1 <tool id="necat" name="necat" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="MIT">
2 <description>Error correction and de-novo assembly for ONT Nanopore reads</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <xrefs>
7 <xref type="bio.tools">necat</xref>
8 </xrefs>
9 <requirements>
10 <requirement type="package" version="@TOOL_VERSION@">necat</requirement>
11 </requirements>
12 <command detect_errors="exit_code"><![CDATA[
13 ## Helper: build the job-directory file name for the i-th input; the "fastqsanger" extension is renamed to "fastq"
14 #def make_filename($i, $input_param)
15 #set ext = $input_param.extension
16 #if $ext == "fastqsanger"
17 #set $ext = "fastq"
18 #end if
19 #set filename = "reads_" + str($i) + "." + $ext
20 #return $filename
21 #end def
22
23 ## Symlink each selected input (avoids copying large read files) and append its name to read_list.txt
24 #set i = 1
25 #for input in $input_fastqs
26 #set filename = $make_filename($i, $input)
27 ln -s '$input' $filename
28 && echo $filename >> read_list.txt &&
29 #set i = $i + 1
30 #end for
31
32 ## #for $i, $input in enumerate($input_fastqs):
33 ## #set filename = 'reads_${i}.$input.ext'
34 ## ln -s '$input' $filename &&
35 ## echo $filename >> read_list.txt &&
36 ## #end for
37
38 ## necat commands: correction always runs; assemble and bridge run only when assembly is requested
39 necat correct '${job_configfile}'
40 #if $assembly.should_assemble == "yes":
41 && necat assemble '${job_configfile}'
42 && necat bridge '${job_configfile}'
43 #end if
44 ]]></command>
45 <configfiles>
46 <expand macro="job_conf" />
47 </configfiles>
48 <inputs>
49 <param name="input_fastqs" type="data" format="fastq,fastq.gz,fasta,fasta.gz" multiple="true" label="Input reads" help="Input read files (FASTQ or FASTA). To select more than one file or collection from your history, use the 'ctrl' key" />
50
51 <param name="genome_size" type="integer" value="" min="1" max="100000000000" label="Genome size" help="Estimated size of genome (bp)" />
52 <param name="min_read_length" type="integer" value="1000" min="1" max="10000000" label="Min read length" help="Minimum length for input reads" />
53 <param name="correction_coverage" type="integer" value="40" min="1" max="10000" label="Correction coverage" help="Number of reads to correct in terms of genome coverage. For a 4Gb genome, setting correction coverage = 10 will correct the longest 40Gb worth of reads from the input fastq. " />
54 <conditional name="assembly">
55 <param name="should_assemble" type="select" label="Assembly">
56 <option value="no" selected="true">Don't perform assembly</option>
57 <option value="yes">Perform assembly on corrected reads</option>
58 </param>
59 <when value="no" />
60 <when value="yes">
61 <param name="assembly_coverage" type="integer" value="30" min="1" max="10000" label="Assembly coverage" help="Number of reads to use in genome assembly in terms of genome coverage" />
62 <param name="polish_contigs" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Polish contigs" help="Polish contigs as final step after bridging" />
63 </when>
64 </conditional>
65
66 <section name="adv" title="Advanced options" expanded="false" help="Warning: only change these if you really know what you are doing">
67 <expand macro="overlap_sensitive_options" />
68 <expand macro="consensus_sensitive_options" />
69 <expand macro="overlap_fast_options" />
70 <expand macro="consensus_fast_options" />
71 <expand macro="trimming_overlap_options" />
72 <expand macro="assembly_overlap_options" />
73 <expand macro="assembly_overlap_filtering" />
74 <expand macro="contig_assembly" />
75 <expand macro="contig_bridging" />
76 </section>
77 </inputs>
78 <outputs>
79 <data name="out_reads" format="fasta.gz" from_work_dir="project/1-consensus/cns_final.fasta.gz" label="${tool.name} on ${on_string}: corrected reads" /> <!-- corrected reads; no filter, so always collected -->
80 <data name="out_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/bridged_contigs.fasta" label="${tool.name} on ${on_string}: bridged assembly"> <!-- unpolished bridged contigs; only when assembling without polishing -->
81 <filter>assembly['should_assemble'] == 'yes' and not assembly['polish_contigs']</filter>
82 </data>
83 <data name="out_polished_assembly" format="fasta" from_work_dir="project/6-bridge_contigs/polished_contigs.fasta" label="${tool.name} on ${on_string}: polished assembly"> <!-- polished contigs; only when assembling with polishing -->
84 <filter>assembly['should_assemble'] == 'yes' and assembly['polish_contigs']</filter>
85 </data>
86 </outputs>
87 <tests>
88 <!-- single input file (test1.fa), assembly and contig polishing enabled; checks corrected reads and polished assembly -->
89 <test expect_num_outputs="2">
90 <param name="input_fastqs" value="test1.fa" />
91 <param name="genome_size" value="13000" />
92 <param name="min_read_length" value="1000" />
93 <param name="correction_coverage" value="40" />
94 <conditional name="assembly">
95 <param name="should_assemble" value="yes" />
96 <param name="assembly_coverage" value="30"/>
97 <param name="polish_contigs" value="true"/>
98 </conditional>
99 <output name="out_reads" ftype="fasta.gz">
100 <assert_contents>
101 <has_size value="75000" delta="2000" />
102 </assert_contents>
103 </output>
104 <output name="out_polished_assembly" ftype="fasta">
105 <assert_contents>
106 <has_line line="&#62;bctg00000000 000000F" />
107 <has_size value="13000" delta="1000" />
108 </assert_contents>
109 </output>
110 </test>
111 <!-- multiple input files of different formats (one .fastq and one .fasta) supplied together to input_fastqs -->
112 <test expect_num_outputs="2">
113 <param name="input_fastqs" value="test1_head.fastq,test1_tail.fasta" />
114 <param name="genome_size" value="13000" />
115 <param name="min_read_length" value="1000" />
116 <param name="correction_coverage" value="40" />
117 <conditional name="assembly">
118 <param name="should_assemble" value="yes" />
119 <param name="assembly_coverage" value="30"/>
120 <param name="polish_contigs" value="true"/>
121 </conditional>
122 <output name="out_reads" ftype="fasta.gz">
123 <assert_contents>
124 <has_size value="29000" delta="2000" />
125 </assert_contents>
126 </output>
127 <output name="out_polished_assembly" ftype="fasta">
128 <assert_contents>
129 <has_line line="&#62;bctg00000000 000000F" />
130 <has_size value="13000" delta="1000" />
131 </assert_contents>
132 </output>
133 </test>
134 <!-- advanced params 1: sets the adv/ovs section (n, k, q, z, b, a, d, e, m) -->
135 <test expect_num_outputs="2">
136 <param name="input_fastqs" value="test1.fa" />
137 <param name="genome_size" value="13000" />
138 <param name="min_read_length" value="1000" />
139 <param name="correction_coverage" value="40" />
140 <conditional name="assembly">
141 <param name="should_assemble" value="yes" />
142 <param name="assembly_coverage" value="30"/>
143 <param name="polish_contigs" value="true"/>
144 </conditional>
145 <section name="adv">
146 <section name="ovs">
147 <param name="n" value="600" />
148 <param name="k" value="14" />
149 <param name="q" value="600" />
150 <param name="z" value="15" />
151 <param name="b" value="2500" />
152 <param name="a" value="800" />
153 <param name="d" value="0.25" />
154 <param name="e" value="0.4" />
155 <param name="m" value="600" />
156 </section>
157 </section>
158 <output name="out_reads" ftype="fasta.gz">
159 <assert_contents>
160 <has_size value="75000" delta="2000" />
161 </assert_contents>
162 </output>
163 <output name="out_polished_assembly" ftype="fasta">
164 <assert_contents>
165 <has_line line="&#62;bctg00000000 000000F" />
166 <has_size value="13000" delta="1000" />
167 </assert_contents>
168 </output>
169 </test>
170 <!-- advanced params 2: sets the adv/fol section (read length bounds, aligned length, overhang, coverage, bestn, overhang_local_deviation1) -->
171 <test expect_num_outputs="2">
172 <param name="input_fastqs" value="test1.fa" />
173 <param name="genome_size" value="13000" />
174 <param name="min_read_length" value="1000" />
175 <param name="correction_coverage" value="40" />
176 <conditional name="assembly">
177 <param name="should_assemble" value="yes" />
178 <param name="assembly_coverage" value="30"/>
179 <param name="polish_contigs" value="true"/>
180 </conditional>
181 <section name="adv">
182 <section name="fol">
183 <param name="min_length" value="2000" />
184 <param name="max_length" value="200000" />
185 <param name="min_aligned_length" value="2000" />
186 <param name="max_overhang" value="20000" />
187 <param name="min_coverage" value="5" />
188 <param name="bestn" value="5" />
189 <param name="overhang_local_deviation1" value="5" />
190 </section>
191 </section>
192 <output name="out_reads" ftype="fasta.gz">
193 <assert_contents>
194 <has_size value="75000" delta="2000" />
195 </assert_contents>
196 </output>
197 <output name="out_polished_assembly" ftype="fasta">
198 <assert_contents>
199 <has_line line="&#62;bctg00000000 000000F" />
200 <has_size value="13000" delta="1000" />
201 </assert_contents>
202 </output>
203 </test>
204 <!-- advanced params 3: sets the adv/fa section (min_length, min_identity, min_contig_length, select_branch) -->
205 <test expect_num_outputs="2">
206 <param name="input_fastqs" value="test1.fa" />
207 <param name="genome_size" value="13000" />
208 <param name="min_read_length" value="1000" />
209 <param name="correction_coverage" value="40" />
210 <conditional name="assembly">
211 <param name="should_assemble" value="yes" />
212 <param name="assembly_coverage" value="30"/>
213 <param name="polish_contigs" value="true"/>
214 </conditional>
215 <section name="adv">
216 <section name="fa">
217 <param name="min_length" value="1000" />
218 <param name="min_identity" value="40" />
219 <param name="min_contig_length" value="600" />
220 <param name="select_branch" value="true" />
221 </section>
222 </section>
223 <output name="out_reads" ftype="fasta.gz">
224 <assert_contents>
225 <has_size value="75000" delta="2000" />
226 </assert_contents>
227 </output>
228 <output name="out_polished_assembly" ftype="fasta">
229 <assert_contents>
230 <has_line line="&#62;bctg00000000 000000F" />
231 <has_size value="13000" delta="1000" />
232 </assert_contents>
233 </output>
234 </test>
235 <!-- advanced params 4: sets the adv/fcb section (read_min_length, ctg_min_length, ctg2ctg_min_identity, read2ctg_min_identity, min_contig_length) -->
236 <test expect_num_outputs="2">
237 <param name="input_fastqs" value="test1.fa" />
238 <param name="genome_size" value="13000" />
239 <param name="min_read_length" value="1000" />
240 <param name="correction_coverage" value="40" />
241 <conditional name="assembly">
242 <param name="should_assemble" value="yes" />
243 <param name="assembly_coverage" value="30"/>
244 <param name="polish_contigs" value="true"/>
245 </conditional>
246 <section name="adv">
247 <section name="fcb">
248 <param name="read_min_length" value="4000" />
249 <param name="ctg_min_length" value="1000" />
250 <param name="ctg2ctg_min_identity" value="90" />
251 <param name="read2ctg_min_identity" value="60" />
252 <param name="min_contig_length" value="1000" />
253 </section>
254 </section>
255 <output name="out_reads" ftype="fasta.gz">
256 <assert_contents>
257 <has_size value="75000" delta="2000" />
258 </assert_contents>
259 </output>
260 <output name="out_polished_assembly" ftype="fasta">
261 <assert_contents>
262 <has_line line="&#62;bctg00000000 000000F" />
263 <has_size value="13000" delta="1000" />
264 </assert_contents>
265 </output>
266 </test>
267 </tests>
268
269 <help><![CDATA[
270
271 NECAT
272 .....
273
274 **What it does**
275
276 | NECAT performs error correction to remove complex errors in nanopore reads. It can also optionally perform de novo assembly.
277 | After assembly it is recommended to use MEDAKA for long-read polishing, then NextPolish for short-read polishing.
278 |
279 | Github: https://github.com/xiaochuanle/NECAT
280 |
281
282 **Input**
283
284 - One or more files or collections containing sequence reads (fastq / fasta)
285
286 **Output**
287
288 - Corrected reads (gzipped fasta)
289 - Genome assembly (fasta) (Optional)
290
291 |
292
293 **Advanced Settings**
294
295 | Necat runs multiple subprograms in an assembly pipeline to create its final output.
296 | Each subprogram does a specific task, then hands its output to the next.
297 | The subprograms are listed in order below, alongside the settings which can be configured:
298 |
299
300 *oc2pmov*
301
302 | Finds overlaps between raw-reads
303 | *Overlap Sensitive Options & Overlap Fast Options*
304 |
305
306 -k <Integer> kmer size
307 -z <Integer> scan window size
308 -q <Integer> kmer occurs > q times will be ignored
309 -b <Integer> block size
310 -n <Integer> number of candidates
311 -a <Integer> min align length
312 -d <Real> ddf score cutoff
313 -e <Real> sequencing error
314 -m <Integer> number of output
315
316 |
317
318 | DEFAULT OPTIONS:
319 | -k 15 -z 10 -q 500 -b 2000 -s 3 -n 500 -a 500 -d 0.250000 -e 0.500000 -m 500 -t 1
320
321 |
322 |
323
324 *oc2cns*
325
326 | Creates consensus reads from raw-read overlaps
327 | *Consensus Sensitive Options & Consensus Fast Options*
328 |
329
330 -a <Integer> align length cutoff
331 -x <Integer> minimal coverage
332 -y <Integer> maximal coverage
333 -l <Integer> minimal length of corrected reads.
334 -f <0 or 1> full consensus or not: 1 = yes, 0 = no
335 -e <Real> sequencing error
336 -p <Real> minimal mapping ratio
337 -r <0 or 1> rescue long indels or not: 1 = yes, 0 = no
338 -u <0 or 1> use dynamic or fixed ident cutoff: 1 = fixed, 0 = dynamic
339
340 |
341
342 | DEFAULT OPTIONS:
343 | -a 400 -x 4 -y 12 -l 500 -f 0 -e 0.500000 -p 0.800000 -t 1 -r 0 -u 0 -s 0
344
345 |
346 |
347
348 *oc2asmpm*
349
350 | Identifies corrected-read overlaps for assembly
351 | *Trimming Overlap Options & Assembly Overlap Options*
352 |
353
354
355 -k <Integer> kmer size
356 -z <Integer> scan window size
357 -q <Integer> kmer occurs > q times will be ignored
358 -b <Integer> block size
359 -n <Integer> number of candidates
360 -a <Integer> min align length
361 -d <Real> ddf score cutoff
362 -e <Real> sequencing error
363 -m <Integer> number of output
364
365 |
366 |
367
368 *fsa_ol_filter*
369
370 | Filters out low-quality corrected-read overlaps for assembly
371 | *Assembly Overlap Filtering Options*
372 |
373
374 --min_length=INT minimum length of reads. default: 2500
375 --max_length=INT maximum length of reads. default: 2147483647
376 --min_identity=DOUBLE minimum identity of overlaps default: -1
377 --min_aligned_length=INT minimum aligned length of overlaps default: 2500
378 --max_overhang=INT maximum overhang of overlaps, negative number = determined by the program. default: -1
379 --min_coverage=INT minimum base coverage, negative number = determined by the program. default: -1
380 --max_coverage=INT maximum base coverage, negative number = determined by the program default: -1
381 --max_diff_coverage=INT maximum difference of base coverage, negative number = determined by the program default: -1
382 --coverage_discard=DOUBLE discard ratio of base coverage. If max_coverage or max_diff_coverage is negative, it will be reset to (100-coverage_discard)th percentile. default: 0.01
383 --bestn=INT output best n overlaps on 5' or 3' end for each read. default: 10
384 --genome_size=INT genome size. It determines the maximum length of reads with coverage together default: 0
385 --coverage=INT coverage. It determines the maximum length of reads with genome_size together default: 40
386 --identity_global_deviation1=DOUBLE If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 98
387 --identity_global_deviation2=DOUBLE If min_identity < 0, min_identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
388 --overhang_global_deviation1=DOUBLE If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 30
389 --overhang_global_deviation2=DOUBLE If max_overhang < 0, max_overhang is set to max(m, deviation1) + 1.4826*mad*deviation2 default: 6
390 --identity_local_deviation1=DOUBLE The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 99
391 --identity_local_deviation2=DOUBLE The local threshold of identity is set to min(m, deviation1) - 1.4826*mad*deviation2 default: 6
392 --overhang_local_deviation1=DOUBLE The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 10
393 --overhang_local_deviation2=DOUBLE The local threshold of overhang is set to max(m, deviation1) + 1.253*mad*deviation2 default: 6
394 --identity_local_condition=INT Local filtering conditions. 0 = overlap identity < threshold, 1 = overlap identity < threshold and query identity >= target identity default: 0
395 --local_low_coverage=INT If the coverage of reads is less than local_low_coverage, min_identity and max_overhang are used to filter out low-quality overlaps. Otherwise, the local threshold is used. default: 25
396
397 |
398 |
399
400 *fsa_assemble*
401
402 | Constructs contigs from filtered overlaps
403 | *Contig Assembly Options*
404 |
405
406 --min_length=INT minimum length of reads default: 0
407 --min_identity=DOUBLE minimum identity of overlaps default: 0
408 --min_aligned_length=INT minimum aligned length of overlaps default: 0
409 --min_contig_length=INT minimum length of contigs default: 500
410 --select_branch=BOOL select the most probable branch default: "no"
411 --max_spur_length=INT branches shorter than the threshold are treated as spurs default: 50000
412
413 |
414 |
415
416 *fsa_ctg_bridge*
417
418 | Bridges contigs using input long raw-reads
419 | *Contig Bridging Options*
420 |
421
422 --read_min_length=INT minimum rawread length default: 5000
423 --ctg_min_length=INT minimum contig length default: 500
424 --ctg2ctg_min_identity=DOUBLE minimum identity of overlaps between contigs default: 95
425 --ctg2ctg_max_overhang=INT maximum overhang of overlaps between contigs default: 100
426 --ctg2ctg_min_aligned_length=INT minimum aligned length of overlaps between contigs default: 2000
427 --read2ctg_min_identity=DOUBLE minimum identity of overlaps between rawreads and contigs default: 80
428 --read2ctg_max_overhang=INT maximum overhang of overlaps between rawreads and contigs default: 500
429 --read2ctg_min_aligned_length=INT minimum aligned length of overlaps between rawreads and contigs default: 5000
430 --read2ctg_min_coverage=INT minimum coverage of links between rawreads and contigs default: 3
431 --min_contig_length=INT minimum length of bridged contig default: 500
432 --select_branch=BOOL select the most probable branch default: "no"
433 --window_size=INT threshold is used to group rawreads that bridge contigs default: 1000
434
435 |
436
437
438 ]]></help>
439 <expand macro="citations" />
440 </tool>