Mercurial > repos > greg > vsnp_determine_ref_from_data
comparison vsnp_determine_ref_from_data.xml @ 4:36bdf8b439ed draft
Uploaded
author | greg |
---|---|
date | Sun, 03 Jan 2021 16:13:22 +0000 |
parents | 6116deacb2c7 |
children | d5e66f9fe086 |
comparison
equal
deleted
inserted
replaced
3:6116deacb2c7 | 4:36bdf8b439ed |
---|---|
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0"> | 1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1" profile="@PROFILE@"> |
2 <description>from input data</description> | 2 <description>from input data</description> |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
3 <requirements> | 6 <requirements> |
4 <requirement type="package" version="1.76">biopython</requirement> | 7 <requirement type="package" version="1.76">biopython</requirement> |
5 <requirement type="package" version="5.3">pyyaml</requirement> | 8 <requirement type="package" version="5.3">pyyaml</requirement> |
6 </requirements> | 9 </requirements> |
7 <command detect_errors="exit_code"><![CDATA[ | 10 <command detect_errors="exit_code"><![CDATA[ |
8 #import os | |
9 #import re | 11 #import re |
10 #set gzipped = 'false' | 12 #set gzipped = 'false' |
11 #set input_type = $input_type_cond.input_type | 13 #set input_type = $input_type_cond.input_type |
12 #set input_reads_dir = 'input_reads' | 14 |
13 #set output_dbkey_dir = 'output_dbkey' | 15 #if $input_type in ["single", "pair"]: |
14 #set output_metrics_dir = 'output_metrics' | 16 #set read1 = $input_type_cond.read1 |
15 mkdir -p $input_reads_dir && | |
16 mkdir -p $output_dbkey_dir && | |
17 mkdir -p $output_metrics_dir && | |
18 #if str($input_type) == "single": | |
19 #set read_type_cond = $input_type_cond.read_type_cond | |
20 #set read1 = $read_type_cond.read1 | |
21 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier)) | 17 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier)) |
22 #if str($read_type_cond.read_type) == "single": | 18 ln -s '${read1}' '${read1_identifier}' && |
23 ln -s '${read1}' '${read1_identifier}' && | 19 #if $input_type == "pair": |
24 #if $read1.is_of_type('fastqsanger.gz'): | 20 #set read2 = $input_type_cond.read2 |
25 #set gzipped = 'true' | 21 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier)) |
26 #end if | 22 ln -s '${read2}' '${read2_identifier}' && |
27 #else: | 23 #else: |
28 #set read2 = $read_type_cond.read2 | 24 #set read2 = None |
29 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier)) | |
30 ln -s '${read1}' '${read1_identifier}' && | |
31 ln -s '${read2}' '${read2_identifier}' && | |
32 #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'): | |
33 #set gzipped = 'true' | |
34 #end if | |
35 #end if | 25 #end if |
36 #else: | 26 #else: |
37 #set collection_type = $input_type_cond.collection_type_cond.collection_type | 27 #set read1 = $input_type_cond.reads_collection['forward'] |
38 #for $i in $input_type_cond.collection_type_cond.reads_collection: | 28 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name)) |
39 #if $i.is_of_type('fastqsanger.gz'): | 29 ln -s '${read1}' '${read1_identifier}' && |
40 #set gzipped = 'true' | 30 #set read2 = $input_type_cond.reads_collection['reverse'] |
41 #end if | 31 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name)) |
42 #set filename = $i.file_name | 32 ln -s '${read2}' '${read2_identifier}' && |
43 #if str($collection_type) == 'single_reads': | |
44 #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) | |
45 #else: | |
46 ## Galaxy builds lists of pairs as nested lists with elements | |
47 ## named forward and reverse. When flattened, these lists | |
48 ## will work as inputs to the Parse parameter value expression | |
49 ## tool in workflows. However, the output list created by the | |
50 ## expression tool will not function correctly with the bwa_mem | |
51 ## mapper. Naming the identifier as follows is a solution. | |
52 #set identifier = re.sub('[^\s\w\-]', '_', str($i.name)) | |
53 #end if | |
54 ln -s '$filename' '$input_reads_dir/$identifier' && | |
55 #end for | |
56 #end if | 33 #end if |
34 | |
57 python '$__tool_directory__/vsnp_determine_ref_from_data.py' | 35 python '$__tool_directory__/vsnp_determine_ref_from_data.py' |
58 #if str($input_type) == "single": | 36 --read1 '${read1_identifier}' |
59 #if str($read_type_cond.read_type) == "single": | 37 #if $read2 is not None |
60 --read1 '${read1_identifier}' | 38 --read2 '${read2_identifier}' |
61 #else: | |
62 --read1 '${read1_identifier}' | |
63 --read2 '${read2_identifier}' | |
64 #end if | 39 #end if |
65 --output_dbkey '$output_dbkey' | 40 --output_dbkey '$output_dbkey' |
66 --output_metrics '$output_metrics' | 41 --output_metrics '$output_metrics' |
42 #if $read1.is_of_type('fastqsanger.gz'): | |
43 --gzipped | |
67 #end if | 44 #end if |
68 --gzipped $gzipped | 45 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields() |
69 --processes $processes | 46 #for $i in $dnaprint_fields: |
70 #if str($in_test_mode) == "false": | 47 --dnaprint_fields '${i[0]}' '${i[2]}' |
71 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields() | 48 #end for |
72 #for $i in $dnaprint_fields: | |
73 --dnaprint_fields '${i[0]}' '${i[2]}' | |
74 #end for | |
75 #else: | |
76 --in_test_mode '$in_test_mode' | |
77 #end if | |
78 ]]></command> | 49 ]]></command> |
79 <inputs> | 50 <inputs> |
80 <conditional name="input_type_cond"> | 51 <conditional name="input_type_cond"> |
81 <param name="input_type" type="select" label="Choose the category of the files to be analyzed"> | 52 <param name="input_type" type="select" label="Choose the category of the files to be analyzed"> |
82 <option value="single" selected="true">Single files</option> | 53 <option value="single" selected="true">Single files</option> |
83 <option value="collection">Collection of files</option> | 54 <option value="paired">Paired reads</option> |
55 <option value="pair">Paired reads in separate data sets</option> | |
84 </param> | 56 </param> |
85 <when value="single"> | 57 <when value="single"> |
86 <conditional name="read_type_cond"> | 58 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/> |
87 <param name="read_type" type="select" label="Choose the read type"> | |
88 <option value="paired" selected="true">Paired</option> | |
89 <option value="single">Single</option> | |
90 </param> | |
91 <when value="paired"> | |
92 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/> | |
93 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/> | |
94 </when> | |
95 <when value="single"> | |
96 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/> | |
97 </when> | |
98 </conditional> | |
99 </when> | 59 </when> |
100 <when value="collection"> | 60 <when value="paired"> |
101 <conditional name="collection_type_cond"> | 61 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/> |
102 <param name="collection_type" type="select" label="Collection of single reads or paired reads?"> | 62 </when> |
103 <option value="single_reads" selected="true">Single reads</option> | 63 <when value="pair"> |
104 <option value="paired_reads">Paired reads</option> | 64 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/> |
105 </param> | 65 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/> |
106 <when value="single_reads"> | |
107 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/> | |
108 </when> | |
109 <when value="paired_reads"> | |
110 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/> | |
111 </when> | |
112 </conditional> | |
113 </when> | 66 </when> |
114 </conditional> | 67 </conditional> |
115 <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/> | |
116 <!-- Functional testing --> | |
117 <param name="in_test_mode" type="hidden" value="false"/> | |
118 </inputs> | 68 </inputs> |
119 <outputs> | 69 <outputs> |
120 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}"> | 70 <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/> |
121 <filter>input_type_cond['input_type'] == 'single'</filter> | 71 <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/> |
122 </data> | |
123 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}"> | |
124 <filter>input_type_cond['input_type'] == 'single'</filter> | |
125 </data> | |
126 <collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}"> | |
127 <discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/> | |
128 <filter>input_type_cond['input_type'] == 'collection'</filter> | |
129 </collection> | |
130 <collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}"> | |
131 <discover_datasets pattern="__name__" directory="output_metrics" format="txt"/> | |
132 <filter>input_type_cond['input_type'] == 'collection'</filter> | |
133 </collection> | |
134 </outputs> | 72 </outputs> |
135 <tests> | 73 <tests> |
136 <test> | 74 <!-- 1 single read --> |
137 <param name="in_test_mode" value="true"/> | 75 <test expect_num_outputs="2"> |
138 <param name="read_type" value="single"/> | 76 <param name="input_type" value="single"/> |
139 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/> | 77 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/> |
140 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/> | 78 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/> |
141 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/> | 79 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/> |
142 </test> | 80 </test> |
143 <test> | 81 <!-- 1 set of paired reads --> |
144 <param name="in_test_mode" value="true"/> | 82 <test expect_num_outputs="2"> |
145 <param name="input_type" value="collection"/> | 83 <param name="input_type" value="pair"/> |
146 <param name="collection_type" value="paired_reads"/> | 84 <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/> |
85 <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/> | |
86 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/> | |
87 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/> | |
88 </test> | |
89 <!-- A collection of paired reads --> | |
90 <test expect_num_outputs="2"> | |
91 <param name="input_type" value="paired"/> | |
147 <param name="reads_collection"> | 92 <param name="reads_collection"> |
148 <collection type="paired"> | 93 <collection type="paired"> |
149 <element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/> | 94 <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/> |
150 <element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/> | 95 <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/> |
151 </collection> | 96 </collection> |
152 </param> | 97 </param> |
153 <output_collection name="output_dbkey_collection" type="list"> | 98 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/> |
154 <element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/> | 99 <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/> |
155 <element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/> | |
156 </output_collection> | |
157 <output_collection name="output_metrics_collection" type="list"> | |
158 <element name="forward.txt" file="forward_metrics.txt" ftype="txt"/> | |
159 <element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/> | |
160 </output_collection> | |
161 </test> | 100 </test> |
162 </tests> | 101 </tests> |
163 <help> | 102 <help> |
164 **What it does** | 103 **What it does** |
165 | 104 |
166 Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the | 105 Accepts a single fastqsanger read, a set of paired reads, or a collection of single or paired reads (bacterial samples) and |
167 best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to | 106 inspects the data to discover the best reference genome for aligning the reads. |
168 perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call | |
169 the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data | |
170 manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated | |
171 with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g., | |
172 **Map with BWA-MEM**) to align the reads to the associated reference. | |
173 | 107 |
174 The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information | 108 The information needed to discover the best reference is maintained by the USDA in this repository_. References are currently |
175 used to compile the "DNA print" string. | 109 |
110 .. _repository: https://github.com/USDA-VS/vSNP_reference_options | |
111 | |
112 limited to TB complex, paraTB, and Brucella, but information for additional references will be added. The information for each | |
113 reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print". These strings | |
114 are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool. | |
115 | |
116 This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the | |
117 input sample data. During inspection, this tool accrues sequence counts for supported species, ultimately generating a string | |
118 consisting of zeros and ones based on the counts, (i.e., a DNA print). This string is then compared to the strings contained | |
119 in the in-memory dictionary of DNA prints to find a match. | |
120 | |
121 The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found, | |
122 the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool, | |
123 to align the reads to the associated reference. | |
124 | |
125 This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information | |
126 about the sequence counts that were discovered in the input sample data that produced the "DNA print" string. | |
176 | 127 |
177 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species, | 128 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species, |
178 and discovering the primary species is critical. DNA print matchig is currently supported for the following genomes. | 129 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes. |
179 | 130 |
180 * Mycobacterium bovis AF2122/97 | 131 * Mycobacterium bovis AF2122/97 |
181 * Brucella abortus bv. 1 str. 9-941 | 132 * Brucella abortus bv. 1 str. 9-941 |
182 * Brucella abortus strain BER | 133 * Brucella abortus strain BER |
183 * Brucella canis ATCC 23365 | 134 * Brucella canis ATCC 23365 |
195 * Brucella suis bv. 3 str. 686 | 146 * Brucella suis bv. 3 str. 686 |
196 | 147 |
197 **Required Options** | 148 **Required Options** |
198 | 149 |
199 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option. | 150 * **Choose the category of the files to be analyzed** - select "Single files" or "Collection of files", then select the appropriate history items (single or paired fastqsanger reads or a collection of fastqsanger reads) based on the selected option. |
200 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time. | |
201 </help> | 151 </help> |
202 <citations> | 152 <expand macro="citations"/> |
203 <citation type="bibtex"> | |
204 @misc{None, | |
205 journal = {None}, | |
206 author = {1. Stuber T}, | |
207 title = {Manuscript in preparation}, | |
208 year = {None}, | |
209 url = {https://github.com/USDA-VS/vSNP},} | |
210 </citation> | |
211 </citations> | |
212 </tool> | 153 </tool> |
213 | 154 |