Mercurial > repos > nml > refseq_masher
comparison matches.xml @ 0:26df66c32861 draft
planemo upload commit 80c22275be05e29208e991019309dfffa9704f39
author | nml |
---|---|
date | Thu, 15 Feb 2018 13:59:31 -0500 |
parents | |
children | 2c1cb37a3ffe |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:26df66c32861 |
---|---|
1 <tool id="refseq_masher_matches" name="RefSeq Masher Matches" version="0.1.1"> | |
<!-- Short summary of what the tool does. -->
2 <description> | |
3 Find closest matching NCBI RefSeq Genomes to your sequences | |
4 </description> | |
<!-- Single package dependency; the pinned version (0.1.1) is the same as the tool's own version attribute. -->
5 <requirements> | |
6 <requirement type="package" version="0.1.1">refseq_masher</requirement> | |
7 </requirements> | |
<!--
  Command template (Cheetah).
  1. Symlinks the selected input dataset(s) into the working directory under
     names that carry a FASTA/FASTQ extension, and collects the quoted link
     names in $input_files.
  2. Runs "refseq_masher matches" on $input_files, writing the result to
     refseq_masher-matches.<output_type> so that the matching output element
     can collect it through from_work_dir.
  For paired collections, gzip compression is detected from the element's
  datatype extension or dataset name in addition to the dataset path: Galaxy
  dataset paths normally end in ".dat", so testing the path alone would
  label gzipped members as plain ".fastq".
  NOTE(review): dataset and collection names are interpolated into the shell
  command inside double quotes without further escaping; confirm this is
  acceptable for the deployment or sanitise the names.
-->
8 <command detect_errors="exit_code"> | |
9 <![CDATA[ | |
10 | |
11 #import re | |
12 | |
13 #if $input.type == 'fasta' | |
14 #set $input_files = '"{}"'.format($input.fasta.name) | |
15 ln -s "$input.fasta" $input_files && | |
16 #elif $input.type == 'paired' | |
17 #set $_forward_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.forward.name) else '.fastq' | |
18 #set $_forward = '"{}_1{}"'.format($re.sub(r'_[12]\..+$', '', $input.forward.name), $_forward_ext) | |
19 #set $_reverse_ext = '.fastq.gz' if $re.match(r'.*\.gz$', $input.reverse.name) else '.fastq' | |
20 #set $_reverse = '"{}_2{}"'.format($re.sub(r'_[12]\..+$', '', $input.reverse.name), $_reverse_ext) | |
21 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
22 ln -s "$input.forward" $_forward && | |
23 ln -s "$input.reverse" $_reverse && | |
24 #elif $input.type == 'single' | |
25 #set $input_files = '"{}"'.format($input.single.name) | |
26 ln -s "$input.single" $input_files && | |
27 #elif $input.type == 'paired_collection' | |
28 #set $_forward_ext = '.fastq.gz' if $input.paired_collection.forward.ext.endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.forward.name)) or $re.match(r'.*\.gz$', str($input.paired_collection.forward)) else '.fastq' | |
29 #set $_forward = '"{}_1{}"'.format($input.paired_collection.name, $_forward_ext) | |
30 #set $_reverse_ext = '.fastq.gz' if $input.paired_collection.reverse.ext.endswith('.gz') or $re.match(r'.*\.gz$', str($input.paired_collection.reverse.name)) or $re.match(r'.*\.gz$', str($input.paired_collection.reverse)) else '.fastq' | |
31 #set $_reverse = '"{}_2{}"'.format($input.paired_collection.name, $_reverse_ext) | |
32 #set $input_files = '{} {}'.format($_forward, $_reverse) | |
33 ln -s "$input.paired_collection.forward" $_forward && | |
34 ln -s "$input.paired_collection.reverse" $_reverse && | |
35 #end if | |
36 | |
37 refseq_masher | |
38 $adv.verbosity | |
39 matches | |
40 --output refseq_masher-matches.${adv.output_type} | |
41 --output-type $adv.output_type | |
42 --top-n-results $top_n_results | |
43 #if $adv.min_kmer_threshold | |
44 --min-kmer-threshold $adv.min_kmer_threshold | |
45 #end if | |
46 -T "\${TMPDIR:-/tmp}" | |
47 $input_files | |
48 ]]> | |
49 </command> | |
<!-- User-facing parameters; each one is referenced by name in the command template. -->
50 <inputs> | |
<!-- Input selector: the chosen type decides which dataset parameter(s) are shown and which branch of the command template creates the symlinks. -->
51 <conditional name="input"> | |
52 <param name="type" type="select" label="Sequence input type"> | |
53 <option value="fasta">Genome FASTA</option> | |
54 <option value="paired">Paired-end FASTQs</option> | |
55 <option value="single">Single-end FASTQ</option> | |
56 <option value="paired_collection">Paired-end FASTQ collection</option> | |
57 </param> | |
58 <when value="fasta"> | |
59 <param name="fasta" | |
60 type="data" format="fasta" | |
61 optional="false" | |
62 label="Genome FASTA file" | |
63 /> | |
64 </when> | |
65 <when value="paired"> | |
66 <param name="forward" | |
67 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
68 optional="false" | |
69 label="Forward FASTQ file" | |
70 /> | |
71 <param name="reverse" | |
72 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
73 optional="false" | |
74 label="Reverse FASTQ file" | |
75 help="File format must match the Forward FASTQ file" | |
76 /> | |
77 </when> | |
78 <when value="single"> | |
79 <param name="single" | |
80 type="data" format="fastq,fastqsanger,fastqillumina,fastqsolexa" | |
81 optional="false" | |
82 label="Single-end FASTQ file" | |
83 /> | |
84 </when> | |
85 <when value="paired_collection"> | |
86 <param name="paired_collection" | |
87 type="data_collection" format="fastq,fastqsanger,fastqillumina,fastqsolexa,fastq.gz,txt" | |
88 collection_type="paired" | |
89 optional="false" | |
90 label="Paired-end FASTQ collection" | |
91 /> | |
92 </when> | |
93 </conditional> | |
<!-- The command template always emits the top-n-results option followed by this value, so the parameter is required (optional="false"): a blank value would leave the option without an argument. -->
94 <param name="top_n_results" | |
95 type="integer" | |
96 min="0" | |
97 value="20" | |
98 optional="false" | |
99 label="Top N matches to report (set to 0 to report all)" | |
100 /> | |
<!-- Advanced options. min_kmer_threshold may be left blank because the command template only adds its option when a value is set; the verbosity option value is inserted into the command line verbatim (empty string = no verbosity flag). -->
101 <section name="adv" title="Advanced Options" expanded="false"> | |
102 <param name="min_kmer_threshold" | |
103 type="integer" | |
104 min="1" | |
105 value="8" | |
106 optional="true" | |
107 label="Mash sketch of reads: Minimum copies of each k-mer required to pass noise filter for reads (default=8)" | |
108 /> | |
109 <param name="output_type" | |
110 type="select" | |
111 label="Output type" | |
112 multiple="false"> | |
113 <option value="tab" selected="true"> | |
114 Tabular (tab-delimited values) | |
115 </option> | |
116 <option value="csv"> | |
117 CSV (Comma Separated Values) | |
118 </option> | |
119 </param> | |
120 <param name="verbosity" | |
121 type="select" | |
122 label="Logging verbosity"> | |
123 <option value="">Error messages only</option> | |
124 <option value="-v">Show warning messages</option> | |
125 <option value="-vv" selected="true">Show info messages</option> | |
126 <option value="-vvv">Show debug messages</option> | |
127 </param> | |
128 </section> | |
129 </inputs> | |
<!-- Two mutually exclusive outputs. Each filter tests adv['output_type'], so only the dataset whose from_work_dir file name (refseq_masher-matches.csv or refseq_masher-matches.tab) matches the selected output type is kept. -->
130 <outputs> | |
131 <data name="output_path_csv" | |
132 format="csv" | |
133 label="RefSeq Masher matches table" | |
134 from_work_dir="refseq_masher-matches.csv"> | |
135 <filter>adv['output_type'] == 'csv'</filter> | |
136 </data> | |
137 <data name="output_path_tab" | |
138 format="tabular" | |
139 label="RefSeq Masher matches table" | |
140 from_work_dir="refseq_masher-matches.tab"> | |
141 <filter>adv['output_type'] == 'tab'</filter> | |
142 </data> | |
143 </outputs> | |
<!-- Functional tests; both request tabular output and compare it to the expected file with lines_diff="0" (no differing lines allowed). -->
144 <tests> | |
<!-- Test 1: genome FASTA input, reporting only the single top match. -->
145 <test> | |
146 <conditional name="input"> | |
147 <param name="type" value="fasta"/> | |
148 <param name="fasta" value="Se-Enteritidis.fasta"/> | |
149 </conditional> | |
150 <param name="top_n_results" value="1"/> | |
151 <section name="adv"> | |
152 <param name="output_type" value="tab"/> | |
153 </section> | |
154 <output name="output_path_tab" | |
155 value="Se-Enteritidis-refseq_masher-matches.tab" | |
156 ftype="tabular" | |
157 lines_diff="0"> | |
158 </output> | |
159 </test> | |
<!-- Test 2: single-end FASTQ input, top match only, with min_kmer_threshold lowered from its default of 8 to 2. -->
160 <test> | |
161 <conditional name="input"> | |
162 <param name="type" value="single"/> | |
163 <param name="single" value="SRR1203042_1-head4000.fastq"/> | |
164 </conditional> | |
165 <param name="top_n_results" value="1"/> | |
166 <section name="adv"> | |
167 <param name="output_type" value="tab"/> | |
168 <param name="min_kmer_threshold" value="2"/> | |
169 </section> | |
170 <output name="output_path_tab" | |
171 value="SRR1203042_1-head4000-refseq_masher-matches-m2.tab" | |
172 ftype="tabular" | |
173 lines_diff="0"> | |
174 </output> | |
175 </test> | |
176 </tests> | |
177 <help> | |
178 <![CDATA[ | |
179 RefSeq Masher - Genomic Distance | |
180 ================================ | |
181 | |
182 Find which NCBI RefSeq genomes most closely match your sequence data, using Mash_ with a Mash sketch database of 54,925 NCBI RefSeq genomes. | |
183 | |
184 | |
185 Source code available on GitHub at https://github.com/phac-nml/refseq_masher | |
186 | |
187 | |
188 `matches` - find the closest matching NCBI RefSeq Genomes to your input sequences | |
189 --------------------------------------------------------------------------------- | |
190 | |
191 Command-line usage information:: | |
192 | |
193 Usage: refseq_masher matches [OPTIONS] INPUT... | |
194 | |
195 Find NCBI RefSeq genome matches for an input genome fasta file | |
196 | |
197 Input is expected to be one or more FASTA/FASTQ files or one or more | |
198 directories containing FASTA/FASTQ files. Files can be Gzipped. | |
199 | |
200 Options: | |
201 --mash-bin TEXT Mash binary path (default="mash") | |
202 -o, --output PATH Output file path (default="-"/stdout) | |
203 --output-type [tab|csv] Output file type (tab|csv) | |
204 -n, --top-n-results INTEGER Output top N results sorted by distance in | |
205 ascending order (default=5) | |
206 -m, --min-kmer-threshold INTEGER | |
207 Mash sketch of reads: "Minimum copies of | |
208 each k-mer required to pass noise filter for | |
209 reads" (default=8) | |
210 -h, --help Show this message and exit. | |
211 | |
212 | |
213 Example | |
214 ~~~~~~~ | |
215 | |
216 With the FNA.GZ_ file for *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_:: | |
217 | |
218 | |
219 # download sequence file | |
220 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
221 | |
222 # find RefSeq matches | |
223 refseq_masher -vv matches GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
224 | |
225 | |
226 **Log**:: | |
227 | |
228 | |
229 2018-01-29 11:02:13,786 INFO: Collected 1 FASTA inputs and 0 read sets [in ...refseq_masher/refseq_masher/utils.py:185] | |
230 2018-01-29 11:02:13,786 INFO: Creating Mash sketch file for ...refseq_masher/GCF_000329025.1_ASM32902v1_genomic.fna.gz [in ...refseq_masher/refseq_masher/mash/sketch.py:24] | |
231 2018-01-29 11:02:14,055 INFO: Created Mash sketch file at "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/sketch.py:40] | |
232 2018-01-29 11:02:14,613 INFO: Ran Mash dist successfully (output length=11647035). Parsing Mash dist output [in ...refseq_masher/refseq_masher/mash/dist.py:64] | |
233 2018-01-29 11:02:15,320 INFO: Parsed Mash dist output into Pandas DataFrame with 54924 rows [in ...refseq_masher/refseq_masher/mash/dist.py:67] | |
234 2018-01-29 11:02:15,321 INFO: Deleting temporary sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" [in ...refseq_masher/refseq_masher/mash/dist.py:72] | |
235 2018-01-29 11:02:15,321 INFO: Sketch file "/tmp/GCF_000329025.1_ASM32902v1_genomic.msh" deleted! [in ...refseq_masher/refseq_masher/mash/dist.py:74] | |
236 2018-01-29 11:02:15,322 INFO: Ran Mash dist on all input. Merging NCBI taxonomic information into results output. [in ...refseq_masher/refseq_masher/cli.py:88] | |
237 2018-01-29 11:02:15,323 INFO: Fetching all taxonomy info for 5 unique NCBI Taxonomy UIDs [in ...refseq_masher/refseq_masher/taxonomy.py:35] | |
238 2018-01-29 11:02:15,325 INFO: Dropping columns with all NA values (ncol=32) [in ...refseq_masher/refseq_masher/taxonomy.py:38] | |
239 2018-01-29 11:02:15,327 INFO: Columns with all NA values dropped (ncol=11) [in ...refseq_masher/refseq_masher/taxonomy.py:40] | |
240 2018-01-29 11:02:15,327 INFO: Merging Mash results with relevant taxonomic information [in ...refseq_masher/refseq_masher/taxonomy.py:41] | |
241 2018-01-29 11:02:15,329 INFO: Merged Mash results with taxonomy info [in ...refseq_masher/refseq_masher/taxonomy.py:43] | |
242 2018-01-29 11:02:15,329 INFO: Merged taxonomic info into results output [in ...refseq_masher/refseq_masher/cli.py:90] | |
243 2018-01-29 11:02:15,329 INFO: Reordering output columns [in ...refseq_masher/refseq_masher/cli.py:91] | |
244 2018-01-29 11:02:15,331 INFO: Writing output to stdout [in ...refseq_masher/refseq_masher/writers.py:16] | |
245 | |
246 | |
247 **Output** | |
248 | |
249 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+ | |
250 | sample | top_taxonomy_name | distance | pvalue | matching | full_taxonomy | taxonomic_subspecies | taxonomic_species | taxonomic_genus | taxonomic_family | taxonomic_order | taxonomic_class | taxonomic_phylum | taxonomic_superkingdom | subspecies | serovar | plasmid | bioproject | biosample | taxid | assembly_accession | match_id | | |
251 +=======================================+====================================================================+==========+========+==========+=============================================================================================================================================================+=====================================+=====================+==================+====================+==================+=====================+===================+=========================+============+=============+=========+=============+==============+========+=====================+==========================================================================================================================================+ | |
252 | GCF_000329025.1_ASM32902v1_genomic | Salmonella enterica subsp. enterica serovar Enteritidis str. CHS44 | 0.0 | 0.0 | 400/400 | Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales; Enterobacteriaceae; Salmonella; enterica; subsp. enterica; serovar Enteritidis; str. CHS44 | Salmonella enterica subsp. enterica | Salmonella enterica | Salmonella | Enterobacteriaceae | Enterobacterales | Gammaproteobacteria | Proteobacteria | Bacteria | enterica | Enteritidis | | PRJNA185053 | SAMN01041154 | 702979 | NZ_ALFF | ./rcn/refseq-NZ-702979-PRJNA185053-SAMN01041154-NZ_ALFF-.-Salmonella_enterica_subsp._enterica_serovar_Enteritidis_str._CHS44.fna | | |
253 +---------------------------------------+--------------------------------------------------------------------+----------+--------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------+---------------------+------------------+--------------------+------------------+---------------------+-------------------+-------------------------+------------+-------------+---------+-------------+--------------+--------+---------------------+------------------------------------------------------------------------------------------------------------------------------------------+ | |
254 | |
255 | |
256 The top match is *Salmonella enterica* subsp. enterica serovar Enteritidis str. CHS44_ with a distance of 0.0 and 400/400 sketches matching, as expected. The results table also contains other taxonomic information that may be useful. | |
257 | |
258 | |
259 | |
260 Legal | |
261 ----- | |
262 | |
263 Copyright Government of Canada 2017 | |
264 | |
265 Written by: National Microbiology Laboratory, Public Health Agency of Canada | |
266 | |
267 Licensed under the Apache License, Version 2.0 (the "License"); you may not use | |
268 this work except in compliance with the License. You may obtain a copy of the | |
269 License at: | |
270 | |
271 http://www.apache.org/licenses/LICENSE-2.0 | |
272 | |
273 Unless required by applicable law or agreed to in writing, software distributed | |
274 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
275 CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
276 specific language governing permissions and limitations under the License. | |
277 | |
278 Contact | |
279 ------- | |
280 | |
281 **Gary van Domselaar**: gary.vandomselaar@phac-aspc.gc.ca | |
282 | |
283 .. _Mash: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x | |
284 .. _FNA.GZ: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/GCF_000329025.1_ASM32902v1_genomic.fna.gz | |
285 .. _CHS44: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/329/025/GCF_000329025.1_ASM32902v1/ | |
286 | |
287 | |
288 ]]> | |
289 </help> | |
290 <citations> | |
291 <!-- Citation for the Mash paper; this DOI is the same article that the Mash link in the help text points to --> | |
292 <citation type="doi">10.1186/s13059-016-0997-x</citation> | |
293 </citations> | |
294 </tool> |