comparison biohansel.xml @ 0:c6e29bb1ab31 draft

planemo upload for repository https://github.com/phac-nml/biohansel commit 854e6b30be03b1402efd45d0a7925cce15c3af0c
author nml
date Wed, 15 Aug 2018 10:34:50 -0400
parents
children 1f8eccf9d15d
comparison
equal deleted inserted replaced
-1:000000000000 0:c6e29bb1ab31
1 <tool id="biohansel" name="biohansel" version="2.1.0">
2 <description>SNP subtyping of genome sequence reads or assemblies</description>
3 <requirements>
4 <requirement type="package" version="2.1.0">bio_hansel</requirement>
5 </requirements>
6 <command detect_errors="exit_code">
7 <![CDATA[
8
9 #import re
10
11 ## Illumina FASTQ naming regular expression (https://github.com/phac-nml/biohansel/issues/38)
12 #set global $ILLUMINA_REGEX = $re.compile(r'^([\w\-\_]+)_S\d+_L\d{3}_R(\d)_001\.fastq(\.gz)?$')
13
14 #def is_gzipped_fastq($data_input)
15 ## Return True when the FASTQ data param has a gzipped datatype, i.e. either 'fastq.gz' or 'fastqsanger.gz'.
16 #return $data_input.is_of_type('fastqsanger.gz') or $data_input.is_of_type('fastq.gz')
17 #end def
18
19 #def get_fastq_ext($data_input)
20 ## Return the file extension for a FASTQ data param: '.fastq.gz' when is_gzipped_fastq is true for it, otherwise '.fastq'.
21 #return '.fastq.gz' if $is_gzipped_fastq($data_input) else '.fastq'
22 #end def
23
24 #def base_sample_name($name)
25 ## Return the base sample name with the read-direction part removed (nothing is appended here): an Illumina-style name yields the prefix captured by ILLUMINA_REGEX; a name containing '_R1'/'_R2' loses that token and its file extension; a name ending in '_<digit>.<ext>' loses that suffix; any other name is returned unchanged.
26 #set $illumina_match = $ILLUMINA_REGEX.match($name)
27 #if $illumina_match
28 #return $illumina_match.group(1)
29 #elif $re.search(r'_R(1|2)', $name):
30 #return $re.sub(r'(.+)_R(1|2)([^\.]*)(\..+)', r'\1\3', $name)
31 #elif $re.match(r'.+_\d\.', $name):
32 #return $re.sub(r'(.+)_(\d)(\..+)', r'\1', $name)
33 #else
34 #return $name
35 #end if
36 #end def
37
38 #def get_paired_fastq_filename($data_input, $name=None, $is_forward=True)
39 ## Return a double-quoted paired FASTQ filename for a data param: the base sample name (taken from $name, or from the dataset name when $name is None)
40 ## followed by '_1' (forward) or '_2' (reverse) and the FASTQ extension from get_fastq_ext; the ending is not added again if the base name already contains it.
41 #set $name = $name if $name is not None else $data_input.name
42 #set $name = $base_sample_name($name)
43 #set $postfix = '1' if $is_forward else '2'
44 #set $ending = '_{}{}'.format($postfix, $get_fastq_ext($data_input))
45 #return '"{}"'.format($name) if $ending in $name else '"{}{}"'.format($name, $ending)
46 #end def
47
48 ## Create symlinks from the Galaxy dataset files to sample-named files and collect the quoted link names in $input_files for the hansel command line. FASTA and single-end inputs use the dataset name as-is; paired inputs (two datasets or a paired collection) are named by get_paired_fastq_filename.
49 #if $input.type == 'fasta'
50 #set $input_files = '"{}"'.format($input.fasta.name)
51 ln -s "$input.fasta" $input_files &&
52 #elif $input.type == 'paired'
53 #set $forward_filename = $get_paired_fastq_filename($input.forward)
54 #set $reverse_filename = $get_paired_fastq_filename($input.reverse, is_forward=False)
55 #set $input_files = '{} {}'.format($forward_filename, $reverse_filename)
56 ln -s "$input.forward" $forward_filename &&
57 ln -s "$input.reverse" $reverse_filename &&
58 #elif $input.type == 'single'
59 #set $input_files = '"{}"'.format($input.single.name)
60 ln -s "$input.single" $input_files &&
61 #elif $input.type == 'paired_collection'
62 #set $forward_filename = $get_paired_fastq_filename($input.paired_collection.forward)
63 #set $reverse_filename = $get_paired_fastq_filename($input.paired_collection.reverse, is_forward=False)
64 #set $input_files = '{} {}'.format($forward_filename, $reverse_filename)
65 ln -s "$input.paired_collection.forward" $forward_filename &&
66 ln -s "$input.paired_collection.reverse" $reverse_filename &&
67 #end if
68
69 ## When a custom scheme is selected and its dataset is of type 'fasta', symlink it under its dataset name; that same name is what the hansel command passes to --scheme.
70 #if $type_of_scheme.scheme_type == "custom":
71 #if $type_of_scheme.scheme_input.is_of_type('fasta'):
72 ln -s '$type_of_scheme.scheme_input' '$type_of_scheme.scheme_input.name' &&
73 #end if
74 #end if
75
76 #def get_subtype_metadata_filename($data_input)
77 ## Build the symlink name for the subtype metadata table so that it carries an explicit extension: dataset name + '.tab' for tabular, dataset name + '.csv' for csv; returns None for any other datatype.
78 #set $filename = $data_input.name
79 #if $data_input.is_of_type('tabular')
80 #return '{}.tab'.format($filename)
81 #elif $data_input.is_of_type('csv')
82 #return '{}.csv'.format($filename)
83 #else
84 #return None
85 #end if
86 #end def
87
88 ## Symlink to the subtype metadata table if one is specified. $subtype_metadata_filename is set as a global in every branch (None when no table is given; no symlink is made when the helper returns None) so the hansel command can test it.
89 #if $subtype_metadata
90 #set global $subtype_metadata_filename = $get_subtype_metadata_filename($subtype_metadata)
91 #if $subtype_metadata_filename
92 ln -s '$subtype_metadata' '$subtype_metadata_filename' &&
93 #end if
94 #else
95 #set global $subtype_metadata_filename = None
96 #end if
97
98 ## biohansel command starts here.
99 ## Optional thresholds are compared as strings against '' (unset) rather than by truthiness, so an explicit 0 / 0.0 is still passed to hansel.
100 ## The scheme metadata file name is single-quoted, matching the quoted symlink name, because dataset names may contain spaces.
101 hansel
102 -vvv
103 -t "\${GALAXY_SLOTS:-1}"
104 -o results.tab
105 -O match_results.tab
106 -S tech_results.tab
107 $dev_args.use_json
108 $input_files
109 --scheme
110 #if $type_of_scheme.scheme_type == "custom":
111 '$type_of_scheme.scheme_input.name'
112 #else:
113 $type_of_scheme.scheme_type
114 #end if
115 #if $subtype_metadata_filename
116 --scheme-metadata '$subtype_metadata_filename'
117 #end if
118 #if str($kmer_vals.kmer_min) != ''
119 --min-kmer-freq $kmer_vals.kmer_min
120 #end if
121 #if str($kmer_vals.kmer_max) != ''
122 --max-kmer-freq $kmer_vals.kmer_max
123 #end if
124 #if str($qc_vals.low_cov_depth_freq) != ''
125 --low-cov-depth-freq $qc_vals.low_cov_depth_freq
126 #end if
127 #if str($qc_vals.max_missing_tiles) != ''
128 --max-missing-tiles $qc_vals.max_missing_tiles
129 #end if
130 #if str($qc_vals.min_ambiguous_tiles) != ''
131 --min-ambiguous-tiles $qc_vals.min_ambiguous_tiles
132 #end if
133 #if str($qc_vals.max_intermediate_tiles) != ''
134 --max-intermediate-tiles $qc_vals.max_intermediate_tiles
135 #end if
136 #if str($qc_vals.low_coverage_warning) != ''
137 --low-cov-warning $qc_vals.low_coverage_warning
138 #end if
139 ]]>
140 </command>
141 <inputs>
142 <conditional name="input"> <!-- Sequence input: the "type" select decides which data parameter(s) below are shown and which branch of the command template is used -->
143 <param name="type" type="select" label="Sequence Data Type">
144 <option value="fasta">Contigs (FASTA)</option>
145 <option value="paired">Paired-end reads (FASTQ)</option>
146 <option value="single">Single-end reads (FASTQ)</option>
147 <option value="paired_collection">Paired-end reads collection (FASTQ)</option>
148 </param>
149 <when value="fasta">
150 <param name="fasta"
151 type="data" format="fasta"
152 optional="false"
153 label="Contigs (FASTA)"
154 />
155 </when>
156 <when value="paired">
157 <param name="forward"
158 type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
159 optional="false"
160 label="Forward reads (FASTQ)"
161 help="Must have ASCII encoded quality scores"
162 />
163 <param name="reverse"
164 type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
165 optional="false"
166 label="Reverse reads (FASTQ)"
167 help="File format must match the Forward FASTQ file"
168 />
169 </when>
170 <when value="single">
171 <param name="single"
172 type="data" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz"
173 optional="false"
174 label="Single-end reads (FASTQ)"
175 />
176 </when>
177 <when value="paired_collection">
178 <param name="paired_collection"
179 type="data_collection" format="fastq,fastqsanger,fastq.gz,fastqsanger.gz,txt"
180 collection_type="paired"
181 optional="false"
182 label="Paired-end reads collection (FASTQ)"
183 />
184 </when>
185 </conditional>
186 <conditional name="type_of_scheme"> <!-- Subtyping scheme: a built-in scheme name is passed to hansel as-is; "custom" adds a FASTA dataset input that is used instead -->
187 <param name="scheme_type" type="select"
188 label="SNP Subtyping Scheme"
189 help="Select the SNP subtyping scheme you wish to subtype with">
190 <option value="heidelberg">Salmonella Heidelberg subtyping scheme</option>
191 <option value="enteritidis">Salmonella Enteritidis subtyping scheme</option>
192 <option value="custom">Specify your own custom scheme</option>
193 </param>
194 <when value="heidelberg"/>
195 <when value="enteritidis"/>
196 <when value="custom">
197 <param name="scheme_input"
198 type="data" format="fasta"
199 label="Your biohansel SNP Subtyping Scheme"/>
200 </when>
201 </conditional>
202 <param name="subtype_metadata"
203 type="data" format="tabular,csv"
204 optional="true"
205 label="Scheme Subtype Metadata Table [Optional]"
206 help="CSV or tab-delimited format only. Must contain a 'subtype' column."
207 /> <!-- Optional table; when given, the command template symlinks it with a .tab or .csv extension and passes it to hansel as the scheme metadata file -->
208 <!-- K-mer frequency thresholds. Both values are optional; the command template adds the option named in each "argument" attribute only when the value passes its test there. -->
209 <section name="kmer_vals" title="K-mer Frequency Thresholds" expanded="False">
210 <param name="kmer_min" type="integer"
211 argument="--min-kmer-freq"
212 optional="true"
213 min="0" value="8"
214 label="Min k-mer frequency/coverage"
215 help="default = 8"/>
216 <param name="kmer_max" type="integer"
217 argument="--max-kmer-freq"
218 optional="true"
219 min="1" value="1000"
220 label="Max k-mer frequency/coverage"
221 help="default = 1000"/>
222 </section>
223 <!-- Quality checking thresholds. All values are optional; the command template adds the option named in each "argument" attribute only when the value passes its test there. The two "tiles" proportions are floats limited to the range 0 to 1. -->
224 <section name="qc_vals" title="Quality Checking Thresholds" expanded="False">
225 <param name="low_cov_depth_freq" type="integer"
226 argument="--low-cov-depth-freq"
227 value="20" min="0"
228 optional="true"
229 label="QC: Frequency below this coverage are considered low coverage"
230 help="default = 20"/>
231 <param name="min_ambiguous_tiles" type="integer"
232 argument="--min-ambiguous-tiles"
233 optional="true"
234 value="3" min="0"
235 label="QC: Min number of tiles missing for Ambiguous Result"
236 help="default = 3"/>
237 <param name="max_missing_tiles" type="float"
238 argument="--max-missing-tiles"
239 optional="true"
240 value="0.05" min="0" max="1"
241 label="QC: Decimal Proportion of max allowed missing tiles" help="default = 0.05, valid values {0.0 - 1.0}"/>
242 <param name="max_intermediate_tiles" type="float"
243 argument="--max-intermediate-tiles"
244 optional="true"
245 value="0.05" min="0" max="1"
246 label="QC: Decimal Proportion of max allowed missing tiles for an intermediate subtype"
247 help="default = 0.05, valid values {0.0 - 1.0}"/>
248 <param name="low_coverage_warning" type="integer"
249 argument="--low-cov-warning"
250 optional="true"
251 value="20"
252 label="QC: Overall tile coverage below this value will trigger a low coverage warning"
253 help="default = 20"/>
254 </section>
255 <section name="dev_args" title="Developer Options" expanded="False"> <!-- use_json expands to the hansel JSON flag when checked and to an empty string otherwise; it also gates the JSON outputs -->
256 <param name="use_json"
257 type="boolean"
258 checked="false"
259 truevalue="--json"
260 falsevalue=""
261 label="Output JSON results"
262 help="Use this option if you need json representations of analysis' details"/>
263 </section>
264 </inputs>
265 <outputs> <!-- The three tabular outputs are always collected from the working directory; the three JSON outputs are collected only when the use_json developer option is enabled -->
266 <data format="tabular" name="results.tab" from_work_dir="results.tab" label="results.tab"/>
267 <data format="tabular" name="match_results.tab" from_work_dir="match_results.tab" label="match_results.tab"/>
268 <data format="tabular" name="tech_results.tab" from_work_dir="tech_results.tab" label="tech_results.tab"/>
269 <data format="json" name="results.json" from_work_dir="results.tab.json" label="results.json">
270 <filter>dev_args['use_json']</filter>
271 </data>
272 <data format="json" name="match_results.json" from_work_dir="match_results.tab.json" label="match_results.json">
273 <filter>dev_args['use_json']</filter>
274 </data>
275 <data format="json" name="tech_results.json" from_work_dir="tech_results.tab.json" label="tech_results.json">
276 <filter>dev_args['use_json']</filter>
277 </data>
278 </outputs>
279 <tests>
280 <test> <!-- FASTA contigs input with the heidelberg scheme; the scheme is set on the scheme_type select inside its conditional -->
281 <conditional name="input">
282 <param name="type" value="fasta"/>
283 <param name="fasta" value="SRR1002850_SMALL.fasta"/>
284 </conditional>
285 <conditional name="type_of_scheme"><param name="scheme_type" value="heidelberg"/></conditional>
286 <output name="results.tab"
287 value="SRR1002850_SMALL.fasta-results.tab"
288 ftype="tabular"
289 compare="sim_size"
290 delta="1000">
291 </output>
292 <output name="match_results.tab"
293 value="SRR1002850_SMALL.fasta-match_results.tab"
294 ftype="tabular"
295 compare="sim_size"
296 delta="16000">
297 </output>
298 <output name="tech_results.tab"
299 value="SRR1002850_SMALL.fasta-tech_results.tab"
300 ftype="tabular"
301 lines_diff="0">
302 </output>
303 </test>
304 <test> <!-- Paired-end FASTQ input with the heidelberg scheme; the scheme is set on the scheme_type select inside its conditional -->
305 <conditional name="input">
306 <param name="type" value="paired"/>
307 <param name="forward" value="SRR5646583_SMALL_1.fastq"/>
308 <param name="reverse" value="SRR5646583_SMALL_2.fastq"/>
309 </conditional>
310 <conditional name="type_of_scheme"><param name="scheme_type" value="heidelberg"/></conditional>
311 <output name="tech_results.tab"
312 value="SRR5646583_SMALL-tech_results.tab"
313 ftype="tabular"
314 lines_diff="0">
315 </output>
316 <output name="results.tab"
317 value="SRR5646583_SMALL-results.tab"
318 ftype="tabular"
319 compare="sim_size"
320 delta="1000">
321 </output>
322 <output name="match_results.tab"
323 value="SRR5646583_SMALL-match_results.tab"
324 ftype="tabular"
325 compare="sim_size"
326 delta="16000">
327 </output>
328 </test>
329 </tests>
330 <help><![CDATA[
331 Subtype microbial whole-genome sequencing (WGS) data using single-nucleotide polymorphism (SNP) targeting k-mer subtyping schemes.
332
333 **Usage**
334
335 1) Select the sequence data you wish to subtype (FASTAs or FASTQs)
336 2) Select subtyping scheme (e.g. *Salmonella enterica* serovar Heidelberg subtyping scheme, *Salmonella enterica* serovar Enteritidis subtyping scheme, or your own custom `biohansel` compatible subtyping scheme)
337 3) [Optional] Select your subtype metadata information table to include subtype metadata along with your subtype results
338 4) Click ``Execute``
339
340 For more information, visit `the biohansel project page <https://github.com/phac-nml/biohansel>`_.
341
342
343 **Example analysis results of a single FASTA file**
344
345 Contents of ``results.tab``:
346
347 +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+
348 | sample | scheme | scheme_version | subtype | all_subtypes | tiles_matching_subtype | are_subtypes_consistent | inconsistent_subtypes | n_tiles_matching_all | n_tiles_matching_all_expected | n_tiles_matching_positive | n_tiles_matching_positive_expected | n_tiles_matching_subtype | n_tiles_matching_subtype_expected | file_path | qc_status | qc_message |
349 +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+
350 | SRR1002850_SMALL | heidelberg | 0.5.0 | 2.2.2.2.1.4 | 2; 2.2; 2.2.2; 2.2.2.2; 2.2.2.2.1; 2.2.2.2.1.4 | 2154958-2.2.2.2.1.4; 1037658-2.2.2.2.1.4; 3785187-2.2.2.2.1.4 | True | | 202 | 202 | 17 | 17 | 3 | 3 | SRR1002850_SMALL.fasta | PASS | |
351 +------------------+------------+----------------+-------------+------------------------------------------------+---------------------------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+------------------------+-----------+------------+
352
353
354 Contents of ``match_results.tab``:
355
356 +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
357 | tilename | seq | is_revcomp | contig_id | match_index | refposition | subtype | is_pos_tile | sample | file_path | scheme | scheme_version | qc_status | qc_message |
358 +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
359 | 2154958-2.2.2.2.1.4 | GGCGCGCCACGGTTACTCCCCGGTGGTCAGCCG | True | NODE_1_length_726282_cov_40.4705_ID_1 | 13732 | 2154958 | 2.2.2.2.1.4 | True | SRR1002850_SMALL | SRR1002850_SMALL.fasta | heidelberg | 0.5.0 | PASS | |
360 +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
361 | negative2131791-2.2.3.1.3 | GCTGGGCGAAATGATGCAGTTCACCACTTGCTC | True | NODE_1_length_726282_cov_40.4705_ID_1 | 36900 | 2131791 | 2.2.3.1.3 | False | SRR1002850_SMALL | SRR1002850_SMALL.fasta | heidelberg | 0.5.0 | PASS | |
362 +---------------------------+-----------------------------------+------------+---------------------------------------+-------------+-------------+-------------+-------------+------------------+------------------------+------------+----------------+-----------+------------+
363
364 *Next 201 lines omitted.*
365
366
367
368 **Example analysis results of a single FASTQ readset**
369
370
371 Contents of ``results.tab``:
372
373 +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+
374 | sample | scheme | scheme_version | subtype | all_subtypes | tiles_matching_subtype | are_subtypes_consistent | inconsistent_subtypes | n_tiles_matching_all | n_tiles_matching_all_expected | n_tiles_matching_positive | n_tiles_matching_positive_expected | n_tiles_matching_subtype | n_tiles_matching_subtype_expected | file_path | avg_tile_coverage | qc_status | qc_message |
375 +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+
376 | SRR5646583_SMALL | heidelberg | 0.5.0 | 2.2.1.1.1.1 | 2; 2.2; 2.2.1; 2.2.1.1; 2.2.1.1.1; 2.2.1.1.1.1 | 1983064-2.2.1.1.1.1; 4211912-2.2.1.1.1.1 | True | | 202 | 202 | 20 | 20 | 2 | 2 | ['SRR5646583_SMALL_1.fastq', 'SRR5646583_SMALL_2.fastq'] | 42.631 | PASS | |
377 +------------------+------------+----------------+-------------+------------------------------------------------+------------------------------------------+-------------------------+-----------------------+----------------------+-------------------------------+---------------------------+------------------------------------+--------------------------+-----------------------------------+----------------------------------------------------------+-------------------+-----------+------------+
378
379 Contents of ``match_results.tab``:
380
381 +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
382 | tilename | seq | freq | refposition | subtype | is_pos_tile | is_kmer_freq_okay | sample | scheme | scheme_version | qc_status | qc_message |
383 +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
384 | negative4642573-1.2 | TACCAGGAAGTGCTGGAAGAGTTTAACGAACAT | 62 | 4642573 | 1.2 | False | True | SRR5646583_SMALL | heidelberg | 0.5.0 | PASS | |
385 +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
386 | 21097-2.2.1.1.1 | GCAAATCGCGCCAGTCAAGTCCTCTTTTACCGT | 42 | 21097 | 2.2.1.1.1 | True | True | SRR5646583_SMALL | heidelberg | 0.5.0 | PASS | |
387 +---------------------+-----------------------------------+------+-------------+-----------+-------------+-------------------+------------------+------------+----------------+-----------+------------+
388
389 *Next 202 lines omitted.*
390
391
392 **Example Subtype Metadata**
393
394 A column with name `subtype` must exist and should have subtype designations that would appear in your biohansel results. There are no requirements for the number of columns or contents of those columns in the table - they can contain whatever you want.
395
396
397 +-------------+-------+--------+------------------+
398 | subtype | clade | source | disease_symptoms |
399 +-------------+-------+--------+------------------+
400 | 1 | I | geese | death |
401 +-------------+-------+--------+------------------+
402 | 1.1 | I | moose | burns |
403 +-------------+-------+--------+------------------+
404 | 2.2.1.1.1 | II | mouse | boils |
405 +-------------+-------+--------+------------------+
406 | 2.2.2.2.1.4 | IIa | house | rash |
407 +-------------+-------+--------+------------------+
408
409 The `biohansel` results table will be joined with the subtype metadata table on the `subtype` field, so if there is subtype metadata for your `biohansel` results, it will show up in the final output table. For example, if you have a sample that produces a result with subtype "1", there will also be columns "clade", "source" and "disease_symptoms" with "I", "geese" and "death", respectively.
410
411
412 Galaxy wrapper written by Matthew Gopez and Peter Kruczkiewicz at the Public Health Agency of Canada, National Microbiology Laboratory.
413
414 ]]></help>
415 <citations> <!-- BibTeX entry: braces are balanced and author names are separated by " and " as BibTeX requires -->
416 <citation type="bibtex">@ARTICLE{a1,
417 title = {A robust genotyping scheme for Salmonella enterica serovar Heidelberg clones circulating in North America},
418 author = {Geneviève Labbé and James Robertson and Peter Kruczkiewicz and Marisa Rankin and Matthew Gopez and Chad R. Laing and Philip Mabon and Kim Ziebell and Aleisha R. Reimer and Lorelee Tschetter and Gary Van Domselaar and Sadjia Bekal and Kimberley A. MacDonald and Linda Hoang and Linda Chui and Danielle Daignault and Durda Slavic and Frank Pollari and E. Jane Parmley and Elissa Giang and Lok Kan Lee and Jonathan Moffat and Joanne MacKinnon and Roger Johnson and John H.E. Nash},
419 url = {https://github.com/phac-nml/bio_hansel}
420 }
421 </citation>
422 </citations>
423 </tool>