comparison vsnp_determine_ref_from_data.xml @ 4:36bdf8b439ed draft

Uploaded
author greg
date Sun, 03 Jan 2021 16:13:22 +0000
parents 6116deacb2c7
children d5e66f9fe086
comparison
equal deleted inserted replaced
3:6116deacb2c7 4:36bdf8b439ed
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0"> 1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
2 <description>from input data</description> 2 <description>from input data</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
3 <requirements> 6 <requirements>
4 <requirement type="package" version="1.76">biopython</requirement> 7 <requirement type="package" version="1.76">biopython</requirement>
5 <requirement type="package" version="5.3">pyyaml</requirement> 8 <requirement type="package" version="5.3">pyyaml</requirement>
6 </requirements> 9 </requirements>
7 <command detect_errors="exit_code"><![CDATA[ 10 <command detect_errors="exit_code"><![CDATA[
8 #import os
9 #import re 11 #import re
10 #set gzipped = 'false' 12 #set gzipped = 'false'
11 #set input_type = $input_type_cond.input_type 13 #set input_type = $input_type_cond.input_type
12 #set input_reads_dir = 'input_reads' 14
13 #set output_dbkey_dir = 'output_dbkey' 15 #if $input_type in ["single", "pair"]:
14 #set output_metrics_dir = 'output_metrics' 16 #set read1 = $input_type_cond.read1
15 mkdir -p $input_reads_dir &&
16 mkdir -p $output_dbkey_dir &&
17 mkdir -p $output_metrics_dir &&
18 #if str($input_type) == "single":
19 #set read_type_cond = $input_type_cond.read_type_cond
20 #set read1 = $read_type_cond.read1
21 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier)) 17 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
22 #if str($read_type_cond.read_type) == "single": 18 ln -s '${read1}' '${read1_identifier}' &&
23 ln -s '${read1}' '${read1_identifier}' && 19 #if $input_type == "pair":
24 #if $read1.is_of_type('fastqsanger.gz'): 20 #set read2 = $input_type_cond.read2
25 #set gzipped = 'true' 21 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
26 #end if 22 ln -s '${read2}' '${read2_identifier}' &&
27 #else: 23 #else:
28 #set read2 = $read_type_cond.read2 24 #set read2 = None
29 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
30 ln -s '${read1}' '${read1_identifier}' &&
31 ln -s '${read2}' '${read2_identifier}' &&
32 #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
33 #set gzipped = 'true'
34 #end if
35 #end if 25 #end if
36 #else: 26 #else:
37 #set collection_type = $input_type_cond.collection_type_cond.collection_type 27 #set read1 = $input_type_cond.reads_collection['forward']
38 #for $i in $input_type_cond.collection_type_cond.reads_collection: 28 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
39 #if $i.is_of_type('fastqsanger.gz'): 29 ln -s '${read1}' '${read1_identifier}' &&
40 #set gzipped = 'true' 30 #set read2 = $input_type_cond.reads_collection['reverse']
41 #end if 31 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
42 #set filename = $i.file_name 32 ln -s '${read2}' '${read2_identifier}' &&
43 #if str($collection_type) == 'single_reads':
44 #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
45 #else:
46 ## Galaxy builds lists of pairs as nested lists with elements
47 ## named forward and reverse. When flattened, these lists
48 ## will work as inputs to the Parse parameter value expression
49 ## tool in workflows. However, the output list created by the
50 ## expression tool will not function correctly with the bwa_mem
51 ## mapper. Naming the identifier as follows is a solution.
52 #set identifier = re.sub('[^\s\w\-]', '_', str($i.name))
53 #end if
54 ln -s '$filename' '$input_reads_dir/$identifier' &&
55 #end for
56 #end if 33 #end if
34
57 python '$__tool_directory__/vsnp_determine_ref_from_data.py' 35 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
58 #if str($input_type) == "single": 36 --read1 '${read1_identifier}'
59 #if str($read_type_cond.read_type) == "single": 37 #if $read2 is not None
60 --read1 '${read1_identifier}' 38 --read2 '${read2_identifier}'
61 #else:
62 --read1 '${read1_identifier}'
63 --read2 '${read2_identifier}'
64 #end if 39 #end if
65 --output_dbkey '$output_dbkey' 40 --output_dbkey '$output_dbkey'
66 --output_metrics '$output_metrics' 41 --output_metrics '$output_metrics'
42 #if $read1.is_of_type('fastqsanger.gz'):
43 --gzipped
67 #end if 44 #end if
68 --gzipped $gzipped 45 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
69 --processes $processes 46 #for $i in $dnaprint_fields:
70 #if str($in_test_mode) == "false": 47 --dnaprint_fields '${i[0]}' '${i[2]}'
71 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields() 48 #end for
72 #for $i in $dnaprint_fields:
73 --dnaprint_fields '${i[0]}' '${i[2]}'
74 #end for
75 #else:
76 --in_test_mode '$in_test_mode'
77 #end if
78 ]]></command> 49 ]]></command>
79 <inputs> 50 <inputs>
80 <conditional name="input_type_cond"> 51 <conditional name="input_type_cond">
81 <param name="input_type" type="select" label="Choose the category of the files to be analyzed"> 52 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
82 <option value="single" selected="true">Single files</option> 53 <option value="single" selected="true">Single files</option>
83 <option value="collection">Collection of files</option> 54 <option value="paired">Paired reads</option>
55 <option value="pair">Paired reads in separate data sets</option>
84 </param> 56 </param>
85 <when value="single"> 57 <when value="single">
86 <conditional name="read_type_cond"> 58 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
87 <param name="read_type" type="select" label="Choose the read type">
88 <option value="paired" selected="true">Paired</option>
89 <option value="single">Single</option>
90 </param>
91 <when value="paired">
92 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
93 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
94 </when>
95 <when value="single">
96 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
97 </when>
98 </conditional>
99 </when> 59 </when>
100 <when value="collection"> 60 <when value="paired">
101 <conditional name="collection_type_cond"> 61 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
102 <param name="collection_type" type="select" label="Collection of single reads or paired reads?"> 62 </when>
103 <option value="single_reads" selected="true">Single reads</option> 63 <when value="pair">
104 <option value="paired_reads">Paired reads</option> 64 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
105 </param> 65 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
106 <when value="single_reads">
107 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
108 </when>
109 <when value="paired_reads">
110 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
111 </when>
112 </conditional>
113 </when> 66 </when>
114 </conditional> 67 </conditional>
115 <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
116 <!-- Functional testing -->
117 <param name="in_test_mode" type="hidden" value="false"/>
118 </inputs> 68 </inputs>
119 <outputs> 69 <outputs>
120 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}"> 70 <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
121 <filter>input_type_cond['input_type'] == 'single'</filter> 71 <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
122 </data>
123 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
124 <filter>input_type_cond['input_type'] == 'single'</filter>
125 </data>
126 <collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}">
127 <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
128 <filter>input_type_cond['input_type'] == 'collection'</filter>
129 </collection>
130 <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
131 <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
132 <filter>input_type_cond['input_type'] == 'collection'</filter>
133 </collection>
134 </outputs> 72 </outputs>
135 <tests> 73 <tests>
136 <test> 74 <!-- 1 single read -->
137 <param name="in_test_mode" value="true"/> 75 <test expect_num_outputs="2">
138 <param name="read_type" value="single"/> 76 <param name="input_type" value="single"/>
139 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/> 77 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
140 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/> 78 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
141 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/> 79 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
142 </test> 80 </test>
143 <test> 81 <!-- 1 set of paired reads -->
144 <param name="in_test_mode" value="true"/> 82 <test expect_num_outputs="2">
145 <param name="input_type" value="collection"/> 83 <param name="input_type" value="pair"/>
146 <param name="collection_type" value="paired_reads"/> 84 <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
85 <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
86 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
87 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
88 </test>
89 <!-- A collection of paired reads -->
90 <test expect_num_outputs="2">
91 <param name="input_type" value="paired"/>
147 <param name="reads_collection"> 92 <param name="reads_collection">
148 <collection type="paired"> 93 <collection type="paired">
149 <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/> 94 <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
150 <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/> 95 <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
151 </collection> 96 </collection>
152 </param> 97 </param>
153 <output_collection name="output_dbkey_collection" type="list"> 98 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
154 <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/> 99 <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
155 <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
156 </output_collection>
157 <output_collection name="output_metrics_collection" type="list">
158 <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
159 <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
160 </output_collection>
161 </test> 100 </test>
162 </tests> 101 </tests>
163 <help> 102 <help>
164 **What it does** 103 **What it does**
165 104
166 Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the 105 Accepts a single fastqsanger read, a set of paired reads, or a collection of single or paired reads (bacterial samples) and
167 best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to 106 inspects the data to discover the best reference genome for aligning the reads.
168 perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
169 the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
170 manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
171 with a Galaxy "dbkey" (i.e., genome build), so when a metach is found, the associated "dbkey" is passed to a mapper (e.g.,
172 **Map with BWA-MEM**) to align the reads to the associated reference.
173 107
174 The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information 108 The information needed to discover the best reference is maintained by the USDA in this repository_. References are currently
175 used to compile the "DNA print" string. 109
110 .. _repository: https://github.com/USDA-VS/vSNP_reference_options
111
112 limited to TB complex, paraTB, and Brucella, but information for additional references will be added. The information for each
113 reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print". These strings
114 are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
115
116 This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
117 input sample data. During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
118 consisting of zeros and ones based on the counts, (i.e., a DNA print). This string is then compared to the strings contained
119 in the in-memory dictionary of DNA prints to find a match.
120
121 The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
122 the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
123 to align the reads to the associated reference.
124
125 This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
126 about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
176 127
177 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species, 128 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
178 and discovering the primary species is critical. DNA print matchig is currently supported for the following genomes. 129 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
179 130
180 * Mycobacterium bovis AF2122/97 131 * Mycobacterium bovis AF2122/97
181 * Brucella abortus bv. 1 str. 9-941 132 * Brucella abortus bv. 1 str. 9-941
182 * Brucella abortus strain BER 133 * Brucella abortus strain BER
183 * Brucella canis ATCC 23365 134 * Brucella canis ATCC 23365
195 * Brucella suis bv. 3 str. 686 146 * Brucella suis bv. 3 str. 686
196 147
197 **Required Options** 148 **Required Options**
198 149
199 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option. 150 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option.
200 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
201 </help> 151 </help>
202 <citations> 152 <expand macro="citations"/>
203 <citation type="bibtex">
204 @misc{None,
205 journal = {None},
206 author = {1. Stuber T},
207 title = {Manuscript in preparation},
208 year = {None},
209 url = {https://github.com/USDA-VS/vSNP},}
210 </citation>
211 </citations>
212 </tool> 153 </tool>
213 154