comparison vsnp_determine_ref_from_data.xml @ 0:12f2b14549f6 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 524a39e08f2bea8b8754284df606ff8dd27ed24b"
author iuc
date Wed, 02 Dec 2020 09:11:24 +0000
parents
children b03e88e7bb1d
comparison
equal deleted inserted replaced
-1:000000000000 0:12f2b14549f6
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
2 <description>from input data</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <requirements>
7 <requirement type="package" version="1.76">biopython</requirement>
8 <requirement type="package" version="5.3">pyyaml</requirement>
9 </requirements>
10 <command detect_errors="exit_code"><![CDATA[
11 #import re
12 ## Each input is symlinked under its element identifier, with every character other than whitespace, word characters and '-' replaced by '_'.
13 #set input_type = $input_type_cond.input_type
14 ## "single" and "pair" provide read1 (plus read2 for "pair") as separate datasets; any other value reads forward/reverse from the paired collection.
15 #if $input_type in ["single", "pair"]:
16 #set read1 = $input_type_cond.read1
17 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
18 ln -s '${read1}' '${read1_identifier}' &&
19 #if $input_type == "pair":
20 #set read2 = $input_type_cond.read2
21 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
22 ln -s '${read2}' '${read2_identifier}' &&
23 #else:
24 #set read2 = None
25 #end if
26 #else:
27 #set read1 = $input_type_cond.reads_collection['forward']
28 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
29 ln -s '${read1}' '${read1_identifier}' &&
30 #set read2 = $input_type_cond.reads_collection['reverse']
31 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
32 ln -s '${read2}' '${read2_identifier}' &&
33 #end if
34 ## read2 is None only for single-read input, so the read2 argument is emitted just for paired data; the gzipped flag is decided from the datatype of read1 alone.
35 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
36 --read1 '${read1_identifier}'
37 #if $read2 is not None
38 --read2 '${read2_identifier}'
39 #end if
40 --output_dbkey '$output_dbkey'
41 --output_metrics '$output_metrics'
42 #if $read1.is_of_type('fastqsanger.gz'):
43 --gzipped
44 #end if
45 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
46 #for $i in $dnaprint_fields:
47 --dnaprint_fields '${i[0]}' '${i[2]}'
48 #end for
49 ]]></command>
50 <inputs>
51 <conditional name="input_type_cond"> <!-- the command template branches on the input_type selected here -->
52 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
53 <option value="single" selected="true">Single files</option>
54 <option value="paired">Paired reads</option> <!-- reads are taken from one paired collection -->
55 <option value="pair">Paired reads in separate data sets</option> <!-- reads are taken from two individual datasets -->
56 </param>
57 <when value="single">
58 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
59 </when>
60 <when value="paired">
61 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
62 </when>
63 <when value="pair">
64 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
65 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
66 </when>
67 </conditional>
68 </inputs>
69 <outputs>
70 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}"/> <!-- path is passed to the script as its output_dbkey option -->
71 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}"/> <!-- path is passed to the script as its output_metrics option -->
72 </outputs>
73 <tests>
74 <!-- input_type "single": one gzipped fastqsanger read; both outputs are compared with stored files -->
75 <test expect_num_outputs="2">
76 <param name="input_type" value="single"/>
77 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
78 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
79 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
80 </test>
81 <!-- input_type "pair": forward and reverse reads supplied as two separate datasets -->
82 <test expect_num_outputs="2">
83 <param name="input_type" value="pair"/>
84 <param name="read1" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
85 <param name="read2" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
86 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
87 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
88 </test>
89 <!-- input_type "paired": the same forward and reverse reads supplied as one paired collection; expects the same output files as the "pair" test -->
90 <test expect_num_outputs="2">
91 <param name="input_type" value="paired"/>
92 <param name="reads_collection">
93 <collection type="paired">
94 <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
95 <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
96 </collection>
97 </param>
98 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
99 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
100 </test>
101 </tests>
102 <help>
103 **What it does**
104
105 Accepts a single fastqsanger read, a pair of fastqsanger reads in separate datasets, or a paired collection of fastqsanger reads
106 (bacterial samples) and inspects the data to discover the best reference genome for aligning the reads.
107
108 The information needed to discover the best reference is maintained by the USDA in this repository_. References are currently
109 limited to TB complex, paraTB, and Brucella, but information for additional references will be added. The information for each
110 reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print". These strings
111 are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
112
113 .. _repository: https://github.com/USDA-VS/vSNP_reference_options
114
115
116 This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
117 input sample data. During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
118 consisting of zeros and ones based on the counts, (i.e., a DNA print). This string is then compared to the strings contained
119 in the in-memory dictionary of DNA prints to find a match.
120
121 The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
122 the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
123 to align the reads to the associated reference.
124
125 This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
126 about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
127
128 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
129 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
130
131 * Mycobacterium bovis AF2122/97
132 * Brucella abortus bv. 1 str. 9-941
133 * Brucella abortus strain BER
134 * Brucella canis ATCC 23365
135 * Brucella ceti TE10759-12
136 * Brucella melitensis bv. 1 str. 16M
137 * Brucella melitensis bv. 3 str. Ether
138 * Brucella melitensis BwIM_SOM_36b
139 * Brucella melitensis ATCC 23457
140 * Brucella ovis ATCC 25840
141 * Brucella suis 1330
142 * Mycobacterium tuberculosis H37Rv
143 * Mycobacterium avium subsp. paratuberculosis strain Telford
144 * Mycobacterium avium subsp. paratuberculosis K-10
145 * Brucella suis ATCC 23445
146 * Brucella suis bv. 3 str. 686
147
148 **Required Options**
149
150 * **Choose the category of the files to be analyzed** - select "Single files", "Paired reads" or "Paired reads in separate data sets", then select the appropriate history items (a single fastqsanger read, a paired collection of fastqsanger reads, or two separate fastqsanger reads) based on the selected option.
151 </help>
152 <expand macro="citations"/>
153 </tool>
154