comparison matches.xml @ 0:26df66c32861 draft

planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author nml
date Thu, 15 Feb 2018 13:59:31 -0500
parents
children 2c1cb37a3ffe
comparison
equal deleted inserted replaced
-1:000000000000 0:26df66c32861
1 <tool id="refseq_masher_matches" name="RefSeq Masher Matches" version="0.1.1">
2 <description>
3 Find closest matching NCBI RefSeq Genomes to your sequences
4 </description>
5 <requirements>
6 <requirement type="package" version="0.1.1">refseq_masher</requirement>
7 </requirements>
8 <command detect_errors="exit_code">
9 <![CDATA[
10 ## Link every input into the working directory under a shell-safe file name, then run refseq_masher matches on the links.
11 #import re
12 ## Dataset and collection names are user-controlled, so any character other than a word character, dot or hyphen is replaced by an underscore before the name is placed on the command line.
13 #if $input.type == 'fasta'
14 #set $input_files = '"{}"'.format($re.sub(r'[^\w.-]', '_', $input.fasta.name))
15 ln -s "$input.fasta" $input_files &&
16 #elif $input.type == 'paired'
17 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq'
18 #set $_forward = '"{}_1{}"'.format($re.sub(r'[^\w.-]', '_', $re.sub(r'_[12]\..+$', '', $input.forward.name)), $_forward_ext)
19 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq'
20 #set $_reverse = '"{}_2{}"'.format($re.sub(r'[^\w.-]', '_', $re.sub(r'_[12]\..+$', '', $input.reverse.name)), $_reverse_ext)
21 #set $input_files = '{} {}'.format($_forward, $_reverse)
22 ln -s "$input.forward" $_forward &&
23 ln -s "$input.reverse" $_reverse &&
24 #elif $input.type == 'single'
25 #set $input_files = '"{}"'.format($re.sub(r'[^\w.-]', '_', $input.single.name))
26 ln -s "$input.single" $input_files &&
27 #elif $input.type == 'paired_collection'
28 #set $_forward_ext = '.fastq.gz' if $input.paired_collection.forward.ext.endswith('.gz') else '.fastq'
29 #set $_forward = '"{}_1{}"'.format($re.sub(r'[^\w.-]', '_', str($input.paired_collection.name)), $_forward_ext)
30 #set $_reverse_ext = '.fastq.gz' if $input.paired_collection.reverse.ext.endswith('.gz') else '.fastq'
31 #set $_reverse = '"{}_2{}"'.format($re.sub(r'[^\w.-]', '_', str($input.paired_collection.name)), $_reverse_ext)
32 #set $input_files = '{} {}'.format($_forward, $_reverse)
33 ln -s "$input.paired_collection.forward" $_forward &&
34 ln -s "$input.paired_collection.reverse" $_reverse &&
35 #end if
36 ## In the paired_collection branch gzip is detected from the datatype extension: Galaxy stores datasets under generic paths (typically dataset_N.dat), so the path itself does not reveal compression.
37 refseq_masher
38 $adv.verbosity
39 matches
40 --output refseq_masher-matches.${adv.output_type}
41 --output-type $adv.output_type
42 --top-n-results $top_n_results
43 #if $adv.min_kmer_threshold
44 --min-kmer-threshold $adv.min_kmer_threshold
45 #end if
46 -T "\${TMPDIR:-/tmp}"
47 $input_files
48 ]]>
49 </command>
50 <inputs>
51 <!-- Input selector: the chosen type (fasta, paired, single or paired_collection) decides which dataset parameter(s) are shown, and the command template branches on the same values. --> <conditional name="input">
52 <param name="type" type="select" label="Sequence input type">
53 <option value="fasta">Genome FASTA</option>
54 <option value="paired">Paired-end FASTQs</option>
55 <option value="single">Single-end FASTQ</option>
56 <option value="paired_collection">Paired-end FASTQ collection</option>
57 </param>
58 <when value="fasta">
59 <param name="fasta"
60 type="data" format="fasta"
61 optional="false"
62 label="Genome FASTA file"
63 />
64 </when>
65 <when value="paired">
66 <param name="forward"
67 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
68 optional="false"
69 label="Forward FASTQ file"
70 />
71 <param name="reverse"
72 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
73 optional="false"
74 label="Reverse FASTQ file"
75 help="File format must match the Forward FASTQ file"
76 />
77 </when>
78 <when value="single">
79 <param name="single"
80 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa"
81 optional="false"
82 label="Single-end FASTQ file"
83 />
84 </when>
85 <when value="paired_collection">
86 <param name="paired_collection"
87 type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt"
88 collection_type="paired"
89 optional="false"
90 label="Paired-end FASTQ collection"
91 />
92 </when>
93 </conditional>
94 <!-- Required on purpose: the command template passes this value to the top-n-results option unconditionally, and 0 already means report all matches. --> <param name="top_n_results"
95 type="integer"
96 min="0"
97 value="20"
98 optional="false"
99 label="Top N matches to report (set to 0 to report all)"
100 />
101 <!-- Advanced options: the k-mer noise-filter threshold, the output table format (also used by the command template as the output file extension) and the logging verbosity, whose option values are inserted verbatim as command-line flags. --> <section name="adv" title="Advanced Options" expanded="false">
102 <param name="min_kmer_threshold"
103 type="integer"
104 min="1"
105 value="8"
106 optional="true"
107 label="Mash sketch of reads: Minimum copies of each k-mer required to pass noise filter for reads (default=8)"
108 />
109 <param name="output_type"
110 type="select"
111 label="Output type"
112 multiple="false">
113 <option value="tab" selected="true">
114 Tabular (tab-delimited values)
115 </option>
116 <option value="csv">
117 CSV (Comma Separated Values)
118 </option>
119 </param>
120 <param name="verbosity"
121 type="select"
122 label="Logging verbosity">
123 <option value="">Error messages only</option>
124 <option value="-v">Show warning messages</option>
125 <option value="-vv" selected="true">Show info messages</option>
126 <option value="-vvv">Show debug messages</option>
127 </param>
128 </section>
129 </inputs>
130 <!-- Two mutually exclusive outputs: each filter keeps only the dataset matching adv.output_type, and from_work_dir collects the refseq_masher-matches file that the command writes with that extension. --> <outputs>
131 <data name="output_path_csv"
132 format="csv"
133 label="RefSeq Masher matches table"
134 from_work_dir="refseq_masher-matches.csv">
135 <filter>adv['output_type'] == 'csv'</filter>
136 </data>
137 <data name="output_path_tab"
138 format="tabular"
139 label="RefSeq Masher matches table"
140 from_work_dir="refseq_masher-matches.tab">
141 <filter>adv['output_type'] == 'tab'</filter>
142 </data>
143 </outputs>
144 <!-- Functional tests: one genome FASTA run and one single-end FASTQ run (min_kmer_threshold set to 2), each requesting the top match only and compared against the expected tabular file with lines_diff of 0. The paired, paired_collection and csv paths are not exercised here. --> <tests>
145 <test>
146 <conditional name="input">
147 <param name="type" value="fasta"/>
148 <param name="fasta" value="Se-Enteritidis.fasta"/>
149 </conditional>
150 <param name="top_n_results" value="1"/>
151 <section name="adv">
152 <param name="output_type" value="tab"/>
153 </section>
154 <output name="output_path_tab"
155 value="Se-Enteritidis-refseq_masher-matches.tab"
156 ftype="tabular"
157 lines_diff="0">
158 </output>
159 </test>
160 <test>
161 <conditional name="input">
162 <param name="type" value="single"/>
163 <param name="single" value="SRR1203042_1-head4000.fastq"/>
164 </conditional>
165 <param name="top_n_results" value="1"/>
166 <section name="adv">
167 <param name="output_type" value="tab"/>
168 <param name="min_kmer_threshold" value="2"/>
169 </section>
170 <output name="output_path_tab"
171 value="SRR1203042_1-head4000-refseq_masher-matches-m2.tab"
172 ftype="tabular"
173 lines_diff="0">
174 </output>
175 </test>
176 </tests>
177 <help>
178 <![CDATA[
179 RefSeq Masher - Genomic Distance
180 ================================
181
182 Find what NCBI RefSeq genomes most closely match your sequence data using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq Genomes.
183
184
185 Source code available on GitHub at https://github.com/phac-nml/refseq_masher
186
187
188 `matches` - find the closest matching NCBI RefSeq Genomes to your input sequences
189 ---------------------------------------------------------------------------------
190
191 Command-line usage information::
192
193 Usage: refseq_masher matches [OPTIONS] INPUT...
194
195 Find NCBI RefSeq genome matches for an input genome fasta file
196
197 Input is expected to be one or more FASTA/FASTQ files or one or more
198 directories containing FASTA/FASTQ files. Files can be Gzipped.
199
200 Options:
201 --mash-bin TEXT Mash binary path (default="mash")
202 -o, --output PATH Output file path (default="-"/stdout)
203 --output-type [tab|csv] Output file type (tab|csv)
204 -n, --top-n-results INTEGER Output top N results sorted by distance in
205 ascending order (default=5)
206 -m, --min-kmer-threshold INTEGER
207 Mash sketch of reads: "Minimum copies of
208 each k-mer required to pass noise filter for
209 reads" (default=8)
210 -h, --help Show this message and exit.
211
212
213 Example
214 ~~~~~~~
215
216 With the FNA.GZ_ file for *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_::
217
218
219 # download sequence file
220 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz
221
222 # find RefSeq matches
223 refseq_masher -vv matches GCF_000329025.1_ASM32902v1_genomic.fna.gz
224
225
226 **Log**::
227
228
229 2018-01-29 11:02:13,786 INFO: Collected 1 FASTA inputs and 0 read sets [in ...refseq_masher/refseq_masher/utils.py:185]
230 2018-01-29 11:02:13,786 INFO: Creating Mash sketch file for ...refseq_masher/GCF_000329025.1_ASM32902v1_genomic.fna.gz [in ...refseq_masher/refseq_masher/mash/sketch.py:24]
231 2018-01-29 11:02:14,055 INFO: Created Mash sketch file at "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/sketch.py:40]
232 2018-01-29 11:02:14,613 INFO: Ran Mash dist successfully (output length=11647035). Parsing Mash dist output [in ...refseq_masher/refseq_masher/mash/dist.py:64]
233 2018-01-29 11:02:15,320 INFO: Parsed Mash dist output into Pandas DataFrame with 54924 rows [in ...refseq_masher/refseq_masher/mash/dist.py:67]
234 2018-01-29 11:02:15,321 INFO: Deleting temporary sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/dist.py:72]
235 2018-01-29 11:02:15,321 INFO: Sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" deleted! [in ...refseq_masher/refseq_masher/mash/dist.py:74]
236 2018-01-29 11:02:15,322 INFO: Ran Mash dist on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:88]
237 2018-01-29 11:02:15,323 INFO: Fetching all taxonomy info for 5 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35]
238 2018-01-29 11:02:15,325 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38]
239 2018-01-29 11:02:15,327 INFO: Columns with all NA values dropped (ncol=11) [in ...refseq_masher/refseq_masher/taxonomy.py:40]
240 2018-01-29 11:02:15,327 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41]
241 2018-01-29 11:02:15,329 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43]
242 2018-01-29 11:02:15,329 INFO: Merged taxonomic info into results output [in ...refseq_masher/refseq_masher/cli.py:90]
243 2018-01-29 11:02:15,329 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:91]
244 2018-01-29 11:02:15,331 INFO: Writing output to stdout [in ...refseq_masher/refseq_masher/writers.py:16]
245
246
247 **Output**
248
249 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+
250 | sample | top_taxonomy_name | distance | pvalue | matching | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum | taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id |
251 +=======================================+====================================================================+==========+========+==========+=============================================================================================================================================================+=====================================+=====================+==================+====================+==================+=====================+===================+=========================+============+=============+=========+=============+==============+========+=====================+==========================================================================================================================================+
252 | GCF_000329025.1_ASM32902v1_genomic | Salmonella enterica subsp. enterica serovar Enteritidis str. CHS44 | 0.0 | 0.0 | 400/400 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Salmonella; enterica; subsp. enterica; serovar Enteritidis; str. CHS44 | Salmonella enterica subsp. enterica | Salmonella enterica | Salmonella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | enterica | Enteritidis | | PRJNA185053 | SAMN01041154 | 702979 | NZ_ALFF | ./rcn/refseq-NZ-702979-PRJNA185053-SAMN01041154-NZ_ALFF-.-Salmonella_enterica_subsp._enterica_serovar_Enteritidis_str._CHS44.fna |
253 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+
254
255
256 The top match is *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_ with a distance of 0.0 and 400/400 sketches matching, which is what we expected. There's other taxonomic information available in the results table that may be useful.
257
258
259
260 Legal
261 -----
262
263 Copyright Government of Canada 2017
264
265 Written by: National Microbiology Laboratory, Public Health Agency of Canada
266
267 Licensed under the Apache License, Version 2.0 (the "License"); you may not use
268 this work except in compliance with the License. You may obtain a copy of the
269 License at:
270
271 http://www.apache.org/licenses/LICENSE-2.0
272
273 Unless required by applicable law or agreed to in writing, software distributed
274 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
275 CONDITIONS OF ANY KIND, either express or implied. See the License for the
276 specific language governing permissions and limitations under the License.
277
278 Contact
279 -------
280
281 **Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca
282
283 .. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x
284 .. _FNA.GZ: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz
285 .. _CHS44: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/
286
287
288 ]]>
289 </help>
290 <citations>
291 <!-- Citation for Mash paper -->
292 <citation type="doi">10.1186/s13059-016-0997-x</citation>
293 </citations>
294 </tool>