wtdbg: wtdbg.xml comparison

comparison wtdbg.xml @ 2:2668027a533b draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/wtdbg commit 05f8373310ce1728426b89f33b643406e0cba54b"

author	bgruening
date	Sat, 29 Jan 2022 12:49:28 +0000
parents	e100f3f4d80e
children

comparison

equal deleted inserted replaced

-:e100f3f4d80e
+:2668027a533b
-<tool id="wtdbg" name="WTDBG" version="2.0">
+<tool id="wtdbg" name="WTDBG2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01" license="GPL-3.0-only">
-<description>De novo assembler AND consensuser for long noisy sequences</description>
+<description>Fast de novo sequence assembler for long noisy reads</description>
+<xrefs>
+<xref type='bio.tools'>wtdbg2</xref>
+</xrefs>
 <macros>
 <import>macros.xml</import>
 </macros>
-<expand macro="requirements" />
+<requirements>
-<version_command>wtdbg2 -help | grep 'Version:'</version_command>
+<requirement type="package" version="@TOOL_VERSION@">wtdbg</requirement>
+</requirements>
+<version_command>wtdbg2 -V</version_command>
 <command detect_errors="exit_code"><![CDATA[
-wtdbg2
+## helper function to sort fastqs before fastas in input readsets
--t \${GALAXY_SLOTS:-4}
+#def sort_fastq_fasta(files):
--i '$i'
+#set fastqs = [f for f in $files if f.is_of_type('fastq')]
--o 'dbg'
+#set fastas = [f for f in $files if f.is_of_type('fasta')]
-#if $I:
+#set out = $fastqs + $fastas
--I '$I'
+#return $out
-#end if
+#end def
-#if $load_alignments:
---load-alignments '$load_alignments'
+## prepare readset inputs (must be fastqs then fastas if multiple files)
-#end if
+#set input_reads_sorted = $sort_fastq_fasta($input_reads)
--k $k
+## perform assembly
--p $p
+wtdbg2
--K $K
+-t \${GALAXY_SLOTS:-4}
--E $E
+-x '${sequencing_technology}'
-$F
+-g '${genome_size}'
--S $S
+@ASM_OPTIONS@
--X $X
--Y $Y
+#for $readset in $input_reads_sorted:
--x $x
+-i '${readset}'
--y $y
+#end for
--l $l
+-fo out
--m $m
--s $s
+&&
---tidy-reads $tidy_reads
+wtpoa-cns
---edge-min $edge_min
+-t \${GALAXY_SLOTS:-4}
-$rescue_low_cov_edges
+-i out.ctg.lay.gz
-&&
+-fo out.fa
-wtdbg-cns
+@CNS_OPTIONS@
--t \${GALAXY_SLOTS:-4}
--o dbg.ctg.lay.fa
--i dbg.ctg.lay
--j $cns.j
--k $cns.k
--Z $cns.Z
--W $cns.W
--H $cns.H
--L $cns.L
--c $cns.c
--M $cns.M
--X $cns.X
--I $cns.I
--D $cns.D
--E $cns.E
--m $cns.m
--S $cns.S
 ]]></command>
 <inputs>
-<param type="data" argument="-i" format="fasta,fasta.gz" label="Long reads sequences file"/>
+<param name="input_reads" type="data" format="fastq,fasta" multiple='true' label="Select input reads from history" help="Select one or more input fastq or fasta files from your history. To select multiple files, use ctrl + click" />
-<param type="data" argument="-I" format="fasta,fasta.gz" optional="True" label="Error-free sequences file"/>
+<param name="sequencing_technology" type="select" label="Sequencing Technology" help="Sequencing technology used to generate reads">
-<param type="data" argument="--load-alignments" name="load_alignments" format="tabular" optional="True" label="Load pre-computed alignments"/>
+<option value="ont">Oxford Nanopore (ont)</option>
+<option value="ccs">PacBio CCS (ccs)</option>
-<param argument="k" type="integer" value="0" min="0" max="25" label="Kmer fsize" />
+<option value="rs">PacBio RSII (rs)</option>
-<param argument="p" type="integer" value="21" min="0" max="25" label="Kmer psize" />
+<option value="sq">PacBio Sequel (sq)</option>
-<param argument="K" type="float" value="1000" min="0" max="65535" label="Filter high frequency kmers" />
+</param>
-<param argument="E" type="integer" value="2" label="Min kmer frequency" />
+<param name="genome_size" type="text" value="" label="Genome size" help="Estimated genome size. k/m/g suffix is allowed - eg a 4500000bp ecoli genome can be written as 4.5m. For a human genome, use 3.2g">
-<param argument="F" type="boolean" truevalue="-F" falsevalue="" checked="False" label="Filter low frequency kmers by a 4G-bytes array" />
+<sanitizer invalid_char="">
-<param argument="S" type="integer" value="4" label="Subsampling kmers, 1/S kmers are indexed" />
+<valid initial="string.letters,string.digits">
-<param argument="X" type="integer" value="4" label="Max number of bin (256bp) in one gap" />
+<add value="." />
-<param argument="Y" type="integer" value="4" label="Max number of bin (256bp) in one deviation" />
+</valid>
-<param argument="x" type="integer" value="-7" label="penalty for BIN gap" />
+</sanitizer>
-<param argument="y" type="integer" value="-21" label="penalty for BIN deviation" />
+<!-- The value is passed verbatim to "wtdbg2 -g"; the default is an empty string, so reject empty or malformed input in the form instead of letting the job fail. -->
+<validator type="regex" message="Enter a number optionally followed by k, m or g (e.g. 4.5m)">^[0-9]+(\.[0-9]+)?[kKmMgG]?$</validator>
+</param>
-<param argument="l" type="float" value="2048" min="1" label="Min length of alignment" />
-<param argument="m" type="float" value="200" label="Min matched" />
+<section name="asm" title="Assembly Options" expanded="false">
-<param argument="s" type="float" value="0.2" label="Max length variation of two aligned fragments" />
+<param argument="-X" type="float" value="50" label="Read depth" help="(-X) [float] Choose the best [float] depth from input reads. ie if the estimated genome size is 5m, setting this value to 50.0 would select the best 250mb worth of reads." />
+<param argument="-L" type="integer" value="0" label="Min read length" help="(-L) [int] Choose the longest subread and drop reads shorter than [int]" />
-<param argument="--tidy-reads" name="tidy_reads" type="integer" value="0" label="Filter reads less than tidy-reads" />
+<param argument="-k" type="integer" value="0" min="0" max="23" label="Kmer size" help="(-k) [int] Kmer size, 0 &#8804; k &#8804; 23" />
-<param argument="--edge-min" name="edge_min" type="integer" value="3" label="The minimal depth of a valid edge set to" />
+<!-- min/max enforce the 0 to 23 range stated in the help text, matching the bounds already set on the -k parameter. -->
+<param argument="-p" type="integer" value="21" min="0" max="23" label="Homopolymer-compressed kmer size" help="(-p) [int] Homopolymer-compressed kmer size, 0 &#8804; p &#8804; 23" />
-<param argument="--rescue-low-cov-edges" name="rescue_low_cov_edges" type="boolean" truevalue="--rescue-low-cov-edges"
+<param argument="-K" type="float" value="1000" label="Max kmer frequency" help="(-K) [float] Filter high frequency kmers where frequency > [float]" />
-falsevalue="" label="Try to rescue low coverage edges" />
+<param argument="-s" type="float" value="0.05" label="Min read similarity" help="(-s) [float] Min similarity between reads to label as related, calculated by kmer matched length / aligned length" />
+<param argument="-e" type="integer" value="3" label="Min edge depth" help="(-e) [int] Min read depth of a valid edge" />
-<section name="cns" title="Consensus options">
+<param name="realign" type="boolean" truevalue="-R" falsevalue="" label="Realignment" help="(-R) Enable realignment mode" />
-<!-- optional inputs -->
+<param name="contained_reads" type="boolean" truevalue="-A" falsevalue="" label="Contained reads" help="(-A) Keep contained reads during alignment" />
-<!-- <param argument="-i" type="data" format="utg.cns" label="Input file(s) *.utg.cns" /> -->
+</section>
-<param argument="-j" type="integer" value="1000" label="Expected length of node" />
+<section name="cns" title="Consensus Options" expanded="false">
-<param argument="-k" type="integer" value="15" label="Kmer size for long reads" />
+<param argument="-j" type="integer" value="1500" label="Expected length of node" />
-<param argument="-Z" type="integer" value="4" label="Z-cutoff, drop the lower" />
+<param argument="-M" type="integer" value="2" label="Match score" />
-<param argument="-W" type="integer" value="48" label="W-cutoff, drop the lagger (position)" />
+<param argument="-X" type="integer" value="-5" label="Mismatch score" />
-<param argument="-H" type="integer" value="1" label="High coverage bonus" />
+<param argument="-I" type="integer" value="-2" label="Insertion score" />
-<param argument="-L" type="integer" value="10" label="High coverage cutoff" />
+<param argument="-D" type="integer" value="-4" label="Deletion score" />
-<param argument="-c" type="select" label="Candidate strategy">
+<param argument="-b" type="integer" value="0" label="Tri-base match bonus" />
-<option value="0" selected="true">best-kmers</option>
+<param argument="-H" type="integer" value="-3" label="Homopolymer merge score used in dp-call-cns mode" />
-<option value="1" >median length</option>
+<param argument="-B" type="text" value="64,1024,0.92" label="POA Bandwidth (Wmin,Wmax,mat_rate)" help="mat_rate = matched_bases/total_bases" />
-<option value="2" >first (include)</option>
+<param argument="-W" type="integer" value="200" label="Window size in the middle of the first read for fast align remaining reads. If -W is negative, will disable fast align, but use the abs(-W) as Band align score cutoff" />
-<option value="3" >first (exclude)</option>
+<param argument="-w" type="integer" value="100" label="Min size of aligned size in window. Will default to -W * 0.5" />
-<option value="4" >longest</option>
+<param argument="-A" type="boolean" truevalue="-A" falsevalue="" label="Abort TriPOA" help="Abort TriPOA when any read cannot be fast aligned, then try POA" />
-<option value="5" >shortest</option>
+<param argument="-S" type="select" label="Shuffle mode">
+<option value="0">don't shuffle reads</option>
+<option value="1" selected="true">shuffle using shared kmers</option>
+<option value="2" >shuffle using subsampling</option>
 </param>
+<param argument="-R" type="integer" value="16" label="Realignment bandwidth" help="set to 0 to disable" />
-<param argument="-M" type="integer" value="2" label="Match score" />
+<param argument="-c" type="select" label="Consensus mode">
-<param argument="-X" type="integer" value="-7" label="Mismatch score" />
+<option value="0" selected="true">run-length</option>
-<param argument="-I" type="integer" value="-3" label="Insertion score" />
+<option value="1">dp-call-cns</option>
-<param argument="-D" type="integer" value="-4" label="Deletion score" />
-<param argument="-E" type="integer" value="-2" label="Gap extension score" />
-<param argument="-m" type="select" label="Correction mode">
-<option value="1" selected="true">DBG correction</option>
-<option value="2" >DAG correction</option>
 </param>
-<param argument="-S" type="integer" value="1" label="Correct structure before error correction" />
+<param argument="-C" type="integer" value="3" label="Min count of bases to call a consensus base" />
+<param argument="-F" type="float" value="0.5" label="Min frequency of non-gap bases to call a consensus base" />
+<param argument="-N" type="integer" value="20" label="Max number of reads in PO-MSA" />
 </section>
 </inputs>
 <outputs>
-<data name="output_alignments" format="fasta" label="${tool.name}  alignments" from_work_dir="dbg.alignments" />
+<data name='out_assembly' format='fasta' label="${tool.name} on ${on_string}: assembled contigs" from_work_dir="out.fa" />
-<data name="output_ctglay" format="txt" label="${tool.name}  contigs layout" from_work_dir="dbg.ctg.lay" />
-<data name="output_consensus" format="fasta" label="${tool.name} consensus" from_work_dir="dbg.ctg.lay.fa" />
 </outputs>
 <tests>
-<test>
+<test expect_num_outputs="1">
-<param name="i" value="ecoli-reads.fa"/>
+<param name="input_reads" value="test1.fastq" />
-<output name="output_alignments" file="result1.alignments"/>
+<param name="sequencing_technology" value="ont" />
-<output name="output_ctglay" file="result1.ctg.lay"/>
+<param name="genome_size" value="60k" />
-<output name="output_consensus" file="consensus_result1.fa"/>
+<output name="out_assembly">
+<assert_contents>
+<has_text text=">ctg1 " />
+<has_text text=">ctg2 " />
+<has_text text=">ctg3 " />
+<has_size value="70000" delta="10000" />
+</assert_contents>
+</output>
 </test>
-<test>
+<test expect_num_outputs="1">
-<param name="i" value="ecoli-reads.fa"/>
+<param name="input_reads" value="test1_head.fa,test1_tail.fastq" />
-<param name="tidy_reads" value="5000"/>
+<param name="sequencing_technology" value="ont" />
-<param name="edge_min" value="2"/>
+<param name="genome_size" value="60k" />
-<param name="rescue_low_cov_edges" value="True"/>
+<output name="out_assembly">
-<output name="output_consensus" file="consensus_result2.fa"/>
+<assert_contents>
+<has_text text=">ctg1 " />
+<has_text text=">ctg2 " />
+<has_text text=">ctg3 " />
+<has_size value="70000" delta="10000" />
+</assert_contents>
+</output>
 </test>
-<test>
+<test expect_num_outputs="1">
-<param name="i" value="ecoli-reads.fa"/>
+<param name="input_reads" value="test1.fastq" />
-<param name="cns.c" value="1"/>
+<param name="sequencing_technology" value="ont" />
-<param name="cns.E" value="-3"/>
+<param name="genome_size" value="60k" />
-<param name="cns.j" value="500"/>
+<!-- Only parameters that exist in the "asm" input section are set here; the former 'l' and 'm' entries had no matching input. -->
+<section name="asm">
-<param name="cns.m" value="2"/>
+<param name='X' value="10.0" />
-<param name="cns.k" value="5"/>
+<param name='L' value="2000" />
-<output name="output_consensus" file="consensus_result3.fa"/>
+<param name='k' value="15" />
+<param name='p' value="0" />
+<param name='K' value="500" />
+<param name='s' value="0.1" />
+<param name='e' value="5" />
+</section>
+<output name="out_assembly">
+<assert_contents>
+<has_text text=">ctg1 " />
+<has_text text=">ctg2 " />
+<has_text text=">ctg3 " />
+<has_size value="50000" delta="10000" />
+</assert_contents>
+</output>
 </test>
 </tests>
 <help><![CDATA[
+**Wtdbg2**
+|
 **What it does**
 WTDBG is a de novo assembler for long noisy sequences, based on fuzzy Bruijn graphs (FBG).
+"Wtdbg2 is a de novo sequence assembler for long noisy reads produced by PacBio or Oxford Nanopore Technologies (ONT). It assembles raw reads without error correction and then builds the consensus from intermediate assembly output. Wtdbg2 is able to assemble the human and even the 32Gb Axolotl genome at a speed tens of times faster than CANU and FALCON while producing contigs of comparable base accuracy."
 **Alignment**
 KBM (Kmer-BIN-Mapping) groups the k-mers from each non-overlapping 256 bp fragment of a long read into a bin.
 Bins in which most k-mers are high-frequency are filtered out as highly repetitive.
 FBG (Fuzzy Bruijn Graph) is composed of vertices of length 1024 bp taken from reads, and edges connecting vertices
 in their order on read paths. Compared with DBG, the vertices in FBG are much bigger and thus are not
 sensitive to small repeats. To tolerate high sequencing error rates, FBG's vertices are found using gapped
 sequence alignments from KBM or other aligners, rather than by searching for identical k-mers as in DBG.
-]]></help>
+See the GitHub repository (https://github.com/ruanjue/wtdbg2) and paper (https://doi.org/10.1038/s41592-019-0669-3) for more information.
+|
+**Input**
+One or more fastq or fasta files. Can be in any fastq/fasta format with any valid extension.
+|
+**Output**
+Assembled contigs (the assembled genome).
+To polish, use external tools such as pilon, racon, medaka, nextpolish etc.
+|
+**Sequencing Technology Preset Information**
+- Oxford Nanopore (ont) (genome size < 1G):     -p 0 -k 15 -AS 2 -s 0.05 -L 5000
+- Oxford Nanopore (ont) (genome size >= 1G):    -p 19 -AS 2 -s 0.05 -L 5000
+- PacBio CCS (ccs):                             -p 21 -k 0 -AS 4 -K 0.05 -s 0.5
+- PacBio RSII (rs):                             -p 21 -S 4 -s 0.05 -L 5000
+- PacBio Sequel (sq) (genome size < 1G):        -p 0 -k 15 -AS 2 -s 0.05 -L 5000
+- PacBio Sequel (sq) (genome size >= 1G):       -p 19 -AS 2 -s 0.05 -L 5000
+|
+]]></help>
 <expand macro="citations" />
 </tool>

Mercurial > repos > bgruening > wtdbg

comparison wtdbg.xml @ 2:2668027a533b draft default tip