comparison mash.xml @ 0:572587cbb1dd draft default tip

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/<name of containing folder> commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:41:36 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:572587cbb1dd
<tool id="mash" name="MASH" version="2.1" profile="18.09">
    <!--
        Galaxy wrapper for the mash command line tool.

        A single "commands" conditional selects the mash sub-command (bounds, dist,
        info, paste, screen, sketch or triangle). Each branch declares the inputs
        for that sub-command, and the Cheetah template in the command section
        assembles the matching command line. The one output dataset changes its
        format according to the selected sub-command.
    -->
    <description>Fast genome and metagenome distance estimation using MinHash</description>
    <edam_topics>
        <edam_topic>topic_0091</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0289</edam_operation>
        <edam_operation>operation_0296</edam_operation>
    </edam_operations>
    <macros>
        <!--
            Sketching parameters shared by the dist, sketch and triangle modes.
            The command template reads them through its "sketching" helper, so a
            parameter added here must also be emitted there.
        -->
        <xml name="sketching_options">
            <!-- sketching -->
            <param name="kmer_size" type="integer" min="1" max="32" value="21" argument="-k" label="K-mer size" />
            <param name="sketch_size" type="integer" min="1" value="1000" argument="-s" label="Sketch size" help="Each sketch will have at most this many non-redundant min-hashes." />
            <param name="hash_seed" type="integer" min="0" max="4294967296" value="42" argument="-S" label="Seed to provide to the hash function" />
            <param name="kmer_size_thresh" type="float" min="0" max="1" value="0.01" argument="-w" label="Probability threshold for warning about low k-mer size" />
            <conditional name="input_option">
                <param name="individual_seq" type="boolean" checked="false" argument="-i" label="Sketch individual sequences" help="Sketch individual sequences, rather than whole files, e.g. for multi-fastas of single-chromosome genomes or pair-wise gene comparisons." />
                <!-- No extra options when sketching individual sequences; the read options below are only offered without -i. -->
                <when value="true" />
                <when value="false">
                    <param name="input_read_set" type="boolean" checked="false" truevalue="-r" falsevalue="" argument="-r" label="Input is a read set" help="Treat the input as a set of sequencing reads. The read options below (Bloom filter, minimum k-mer copies, target coverage, genome size) apply to read sets." />
                    <!-- sketching reads -->
                    <param name="bloom_size" type="text" argument="-b" optional="true" label="Use a Bloom filter of this size (raw bytes or with K/M/G/T)" help="Use a Bloom filter of this size (raw bytes or with K/M/G/T) to filter out unique k-mers. This is useful if exact filtering uses too much memory. However, some unique k-mers may pass erroneously, and copies cannot be counted beyond 2." >
                        <validator type="regex"><![CDATA[^[0-9]+[KMGT]?$]]></validator>
                    </param>
                    <param name="kmer_count" type="integer" min="0" value="1" argument="-m" label="Minimum copies of each k-mer required to pass noise filter for reads." />
                    <param name="target_coverage" type="float" min="0" value="0" argument="-c" label="Target coverage" help="Sketching will conclude if this coverage is reached before the end of the input file (estimated by average k-mer multiplicity)." />
                    <param name="genome_size" type="text" argument="-g" optional="true" label="Genome size (raw bases or with K/M/G/T)" help="If specified, will be used for p-value calculation instead of an estimated size from k-mer content." >
                        <validator type="regex"><![CDATA[^[0-9]+[KMGT]?$]]></validator>
                    </param>
                </when>
            </conditional>
            <!-- alphabet -->
            <param name="preserve_strand" type="boolean" checked="false" truevalue="-n" falsevalue="" argument="-n" label="Preserve strand" help="By default, strand is ignored by using canonical DNA k-mers, which are alphabetical minima of forward-reverse pairs." />
            <param name="use_aaa" type="boolean" checked="false" truevalue="-a" falsevalue="" argument="-a" label="Use amino acid alphabet (A-Z, except BJOUXZ)" help="Implied kmer size of 9 and preserve strand" />
            <param name="alphabet" type="text" area="false" argument="-z" label="Alphabet to base hashes on" >
                <validator type="regex"><![CDATA[^[A-Za-z]*$]]></validator>
            </param>
            <param name="preserve_case" type="boolean" checked="false" truevalue="-Z" falsevalue="" argument="-Z" label="Preserve case in k-mers and alphabet" help="Sequence letters whose case is not in the current alphabet will be skipped when sketching." />
        </xml>
    </macros>
    <requirements>
        <requirement type="package" version="2.1">mash</requirement>
    </requirements>
    <version_command>mash --version</version_command>
    <command detect_errors="aggressive"><![CDATA[
        #import os
        ## ===== Declare reusable parameter output for different modes ===
        ## Emits every option declared by the sketching_options macro.
        #def sketching
            -k $commands.kmer_size -s $commands.sketch_size
            #if $commands.input_option.individual_seq
                -i ##Hack around current conditional truevalue/falsevalue behavior
            #end if
            -S $commands.hash_seed -w $commands.kmer_size_thresh
            ## The read options only exist in the branch where -i is off.
            #unless $commands.input_option.individual_seq
                $commands.input_option.input_read_set
                #if $commands.input_option.bloom_size
                    -b $commands.input_option.bloom_size
                #end if
                ## The mash usage text documents -m and -c as implying -r, so they are
                ## only emitted when changed from their defaults (1 and 0). Passing them
                ## unconditionally would make every input be handled as a read set.
                #if int(str($commands.input_option.kmer_count)) != 1
                    -m $commands.input_option.kmer_count
                #end if
                #if float(str($commands.input_option.target_coverage)) > 0
                    -c $commands.input_option.target_coverage
                #end if
                #if $commands.input_option.genome_size
                    -g $commands.input_option.genome_size
                #end if
            #end unless
            $commands.preserve_strand $commands.use_aaa
            #if $commands.alphabet
                -z "$commands.alphabet"
            #end if
            $commands.preserve_case
        #end def

        ## ===== Change to extension MASH expects =====
        ## Sketch datasets are symlinked into the working directory under a name
        ## ending in .msh; the same names are used again when the command is built.
        #if $commands.command in ["dist", "screen"] and $commands.reference.is_of_type("msh")
            ln -sf $commands.reference ${os.path.basename(str($commands.reference))}.msh &&
        #end if
        #if $commands.command in ["dist", "sketch", "triangle"]
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    ln -sf $input ${os.path.basename(str($input))}.msh &&
                #end if
            #end for
        #end if
        #if $commands.command == "info"
            ln -sf $commands.sketch ${os.path.basename(str($commands.sketch))}.msh &&
        #end if
        ## In sketch mode mash writes to "output.msh", which is linked to the Galaxy output.
        #if $commands.command in ["sketch"]
            ln -sf $output output.msh &&
        #end if

        ## ===== Execute MASH =====
        mash $commands.command

        ## ===== Enable threading =====
        ## Deliberately disabled through the constant False condition.
        #if False and $commands.command in ["dist", "screen", "sketch", "triangle"]
            -p \${GALAXY_SLOTS:-1} ## -p seems broken at the time of writing
        #end if

        ## ===== Select correct output parameters for selected mode =====
        #if $commands.command == "bounds"
            ## Forward the two options declared for this mode.
            -k $commands.kmer_size -p $commands.error_bound
            >> $output
        #else if $commands.command == "dist"
            $commands.table_output -v $commands.max_p -d $commands.max_dist
            ## Sketching options are only passed when none of the queries is already a sketch.
            #set $sketch=True
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    #set $sketch=False
                    #break
                #end if
            #end for
            #if $sketch
                $sketching()
            #end if
            #if $commands.reference.is_of_type("msh")
                ${os.path.basename(str($commands.reference))}.msh
            #else
                $commands.reference
            #end if
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    ${os.path.basename(str($input))}.msh
                #else
                    $input
                #end if
            #end for
            >> $output
        #else if $commands.command == "info"
            ## Use the .msh symlink created above, as the other modes do for sketch inputs.
            $commands.info_mode ${os.path.basename(str($commands.sketch))}.msh >> $output
        #else if $commands.command == "paste"
            ## NOTE(review): mash paste presumably writes "<out_prefix>.msh" instead of
            ## printing the merged sketch to stdout, and the sketches are passed without
            ## a .msh name, so the redirect below may not capture the result. Confirm
            ## against the mash documentation before relying on this mode.
            "$commands.out_prefix"
            #for $sketch in $commands.sketches
                $sketch
            #end for
            >> $output
        #else if $commands.command == "screen"
            $commands.winner_takes_all -i $commands.min_ident -v $commands.max_p
            #if $commands.reference.is_of_type("msh")
                ${os.path.basename(str($commands.reference))}.msh
            #else
                $commands.reference
            #end if
            #for $pool in $commands.pools
                $pool
            #end for
            >> $output
        #else if $commands.command == "sketch"
            ## Nothing is redirected here: the result arrives through the output.msh symlink.
            -o output
            $sketching()
            #for $input in $commands.inputs
                $input
            #end for
        #else if $commands.command == "triangle"
            $commands.comment_fields
            $sketching()
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    ${os.path.basename(str($input))}.msh
                #else
                    $input
                #end if
            #end for
            >> $output
        #end if
    ]]></command>
    <inputs>
        <!-- One branch per mash sub-command; the selected value is passed to mash verbatim. -->
        <conditional name="commands">
            <param name="command" type="select" label="Mode">
                <option value="bounds">Bounds: Print a table of Mash error bounds</option>
                <option value="dist">Dist: Estimate the distance of query sequences to references</option>
                <option value="info">Info: Display information about sketch files</option>
                <option value="paste">Paste: Create a single sketch file from multiple sketch files</option>
                <option value="screen">Screen: Determine whether query sequences are within a larger pool of sequences</option>
                <option value="sketch">Sketch: Create sketches (reduced representations for fast operations)</option>
                <option value="triangle">Triangle: Estimate a lower-triangular distance matrix</option>
            </param>
            <when value="bounds">
                <param name="kmer_size" type="integer" min="1" max="32" value="21" argument="-k" label="K-mer size" />
                <param name="error_bound" type="float" min="0" max="1" value="0.99" argument="-p" label="Mash distance estimates will be within the given error bounds with this probability" />
            </when>
            <when value="dist">
                <!-- input -->
                <param name="reference" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" label="Reference" />
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" multiple="true" label="Queries" />
                <!-- output -->
                <param name="table_output" type="boolean" checked="false" truevalue="-t" falsevalue="" argument="-t" label="Table output" help="Table output (will not report p-values, but fields will be blank if they do not meet the p-value threshold)." />
                <param name="max_p" type="float" min="0" max="1" value="1" argument="-v" label="Maximum p-value to report" />
                <param name="max_dist" type="float" min="0" max="1" value="1" argument="-d" label="Maximum distance to report" />
                <expand macro="sketching_options" />
            </when>
            <when value="info">
                <!-- The sketch file to describe; the command template reads it as $commands.sketch. -->
                <param name="sketch" type="data" format="msh" label="Sketch" />
                <param name="info_mode" type="select" label="Mode">
                    <option value="" selected="true">Default</option>
                    <option value="-H">Only show header info, do not list each sketch</option>
                    <option value="-t">Tabular output (rather than padded), with no header</option>
                    <option value="-c">Show hash count histograms for each sketch</option>
                    <option value="-d">Dump sketches in JSON format</option>
                </param>
            </when>
            <when value="paste">
                <param name="sketches" type="data" format="msh" multiple="true" label="Sketches to merge" />
                <param name="out_prefix" type="text" area="false" label="Out prefix" />
            </when>
            <when value="screen">
                <param name="reference" type="data" format="msh" label="inputs" help="Use the sketch command to combine multiple sketch inputs." />
                <param name="pools" type="data" format="fasta,fastq,fasta.gz,fastq.gz" multiple="true" label="Pools" help="The pool sequences are assumed to be nucleotides, and will be 6-frame translated if the inputs are amino acids." />
                <param name="winner_takes_all" type="boolean" checked="false" truevalue="-w" falsevalue="" argument="-w" label="Winner-takes-all strategy for identity estimates" help="After counting hashes for each query, hashes that appear in multiple inputs will be removed from all except the one with the best identity (ties broken by larger query), and other identities will be reduced. This removes output redundancy, providing a rough compositional outline." />
                <param name="min_ident" type="float" min="-1" max="1" value="0" argument="-i" label="Minimum identity to report" help="Inclusive unless set to zero, in which case only identities greater than zero (i.e. with at least one shared hash) will be reported. Set to -1 to output everything." />
                <param name="max_p" type="float" min="0" max="1" value="1" argument="-v" label="Maximum p-value to report" />
            </when>
            <when value="sketch">
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz" multiple="true" label="Inputs" />
                <!-- TODO -I and -C -->
                <expand macro="sketching_options" />
            </when>
            <when value="triangle">
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" multiple="true" label="Input sequences" />
                <param name="comment_fields" type="boolean" checked="false" truevalue="-C" falsevalue="" argument="-C" label="Use comment fields for sequence names instead of IDs" />
                <expand macro="sketching_options" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single output dataset; its datatype follows the selected mode. -->
        <data name="output" format="txt">
            <change_format>
                <when input="commands.command" value="bounds" format="tabular" />
                <when input="commands.command" value="dist" format="tsv" />
                <when input="commands.command" value="info" format="txt" />
                <when input="commands.command" value="paste" format="msh" />
                <when input="commands.command" value="screen" format="txt" />
                <when input="commands.command" value="sketch" format="msh" />
                <when input="commands.command" value="triangle" format="phylip" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- bounds mode with default parameters -->
        <test expect_num_outputs="1">
            <conditional name="commands">
                <param name="command" value="bounds" />
            </conditional>
            <output name="output" ftype="tabular">
                <assert_contents>
                    <has_text text="Mash distance" />
                    <has_text text="Screen distance" />
                </assert_contents>
            </output>
        </test>
        <!-- dist mode: one reference against three queries, sketching individual sequences -->
        <test expect_num_outputs="1">
            <conditional name="commands">
                <param name="command" value="dist" />
                <param name="reference" value="test-data/15600_genome.fasta" ftype="fasta" />
                <!-- The queries parameter of dist mode is named "inputs". -->
                <param name="inputs" >
                    <collection type="list">
                        <element name="test1" value="test-data/15596_genome.fasta" ftype="fasta" />
                        <element name="test2" value="test-data/15602_genome.fasta" ftype="fasta" />
                        <element name="test3" value="test-data/15599_genome.fasta" ftype="fasta" />
                    </collection>
                </param>
                <conditional name="input_option">
                    <param name="individual_seq" value="true" />
                </conditional>
            </conditional>
            <output name="output" checksum="sha256:701ecc0d4fec3383267699ebd9d78dd7b84360ff57754aac4f2bc8bc00aee580" ftype="tsv" />
        </test>
    </tests>
    <help><![CDATA[
        https://mash.readthedocs.io/en/latest/
    ]]></help>
    <citations>
        <citation type="doi">10.5281/zenodo.3364789</citation>
        <citation type="doi">10.1186/s13059-016-0997-x</citation>
    </citations>
</tool>