annotate vsnp_determine_ref_from_data.xml @ 0:ebc08e5ce646 draft

Uploaded
author greg
date Tue, 21 Apr 2020 10:08:28 -0400
parents
children bca267738b33
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
2 <description>from input data</description>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
3 <requirements>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
4 <requirement type="package" version="1.76">biopython</requirement>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
5 <requirement type="package" version="5.3">pyyaml</requirement>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
6 </requirements>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
7 <command detect_errors="exit_code"><![CDATA[
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
8 #import os
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
9 #import re
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
10 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
11 #set gzipped = 'false'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
12 #set input_type = $input_type_cond.input_type
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
13 #set input_reads_dir = 'input_reads'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
14 #set output_dbkey_dir = 'output_dbkey'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
15 #set output_metrics_dir = 'output_metrics'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
16 mkdir -p $input_reads_dir &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
17 mkdir -p $output_dbkey_dir &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
18 mkdir -p $output_metrics_dir &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
19 #if str($input_type) == "single":
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
20 #set read_type_cond = $input_type_cond.read_type_cond
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
21 #set read1 = $read_type_cond.read1
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
22 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
23 #if str($read_type_cond.read_type) == "single":
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
24 ln -s '${read1}' '${read1_identifier}' &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
25 #if $read1.is_of_type('fastqsanger.gz'):
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
26 #set gzipped = 'true'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
27 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
28 #else:
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
29 #set read2 = $read_type_cond.read2
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
30 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
31 ln -s '${read1}' '${read1_identifier}' &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
32 ln -s '${read2}' '${read2_identifier}' &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
33 #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
34 #set gzipped = 'true'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
35 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
36 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
37 #else:
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
38 #for $i in $input_type_cond.reads_collection:
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
39 #if $i.is_of_type('fastqsanger.gz'):
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
40 #set gzipped = 'true'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
41 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
42 #set filename = $i.file_name
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
43 #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
44 ln -s '${filename}' '${input_reads_dir}/${identifier}' &&
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
45 #end for
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
46 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
47 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
48 #if str($input_type) == "single":
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
49 #if str($read_type_cond.read_type) == "single":
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
50 --read1 '${read1_identifier}'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
51 #else:
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
52 --read1 '${read1_identifier}'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
53 --read2 '${read2_identifier}'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
54 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
55 --output_dbkey '$output_dbkey'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
56 --output_metrics '$output_metrics'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
57 #end if
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
58 --gzipped $gzipped
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
59 --processes $processes
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
60 #for $i in $dnaprint_fields:
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
61 --dnaprint_fields '${i[0]}' '${i[2]}'
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
62 #end for
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
63 ]]></command>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
64 <inputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
65 <conditional name="input_type_cond">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
66 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
67 <option value="single" selected="true">Single files</option>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
68 <option value="collection">Collections of files</option>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
69 </param>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
70 <when value="single">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
71 <conditional name="read_type_cond">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
72 <param name="read_type" type="select" label="Choose the read type">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
73 <option value="paired" selected="true">Paired</option>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
74 <option value="single">Single</option>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
75 </param>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
76 <when value="paired">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
77 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
78 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
79 </when>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
80 <when value="single">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
81 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
82 </when>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
83 </conditional>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
84 </when>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
85 <when value="collection">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
86 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
87 </when>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
88 </conditional>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
89 <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
90 </inputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
91 <outputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
92 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
93 <filter>input_type_cond['input_type'] == 'single'</filter>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
94 </data>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
95 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
96 <filter>input_type_cond['input_type'] == 'single'</filter>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
97 </data>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
98 <collection name="output_dbkey_collection" type="list">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
99 <discover_datasets pattern="__name__" directory="output_dbkey" format="txt" />
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
100 <filter>input_type_cond['input_type'] == 'collection'</filter>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
101 </collection>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
102 <collection name="output_metrics_collection" type="list">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
103 <discover_datasets pattern="__name__" directory="output_metrics" format="txt" />
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
104 <filter>input_type_cond['input_type'] == 'collection'</filter>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
105 </collection>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
106 </outputs>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
107 <tests>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
108 <test>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
109 <!-- Need to figure out how to test installed data tables -->
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
110 <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
111 <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
112 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
113 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
114 </test>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
115 </tests>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
116 <help>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
117 **What it does**
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
118
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
119 Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
120 best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
121 perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
122 the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
123 manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
124 with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
125 **Map with BWA-MEM**) to align the reads to the associated reference.
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
126
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
127 The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
128 used to compile the "DNA print" string.
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
129
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
130 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
131 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
132
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
133 * Mycobacterium bovis AF2122/97
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
134 * Brucella abortus bv. 1 str. 9-941
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
135 * Brucella abortus strain BER
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
136 * Brucella canis ATCC 23365
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
137 * Brucella ceti TE10759-12
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
138 * Brucella melitensis bv. 1 str. 16M
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
139 * Brucella melitensis bv. 3 str. Ether
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
140 * Brucella melitensis BwIM_SOM_36b
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
141 * Brucella melitensis ATCC 23457
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
142 * Brucella ovis ATCC 25840
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
143 * Brucella suis 1330
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
144 * Mycobacterium tuberculosis H37Rv
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
145 * Mycobacterium avium subsp. paratuberculosis strain Telford
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
146 * Mycobacterium avium subsp. paratuberculosis K-10
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
147 * Brucella suis ATCC 23445
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
148 * Brucella suis bv. 3 str. 686
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
149
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
150 **Required Options**
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
151
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
152 * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
153 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
154 </help>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
155 <citations>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
156 <citation type="bibtex">
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
157 @misc{None,
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
158 journal = {None},
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
159 author = {1. Stuber T},
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
160 title = {Manuscript in preparation},
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
161 year = {None},
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
162 url = {https://github.com/USDA-VS/vSNP},}
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
163 </citation>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
164 </citations>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
165 </tool>
ebc08e5ce646 Uploaded
greg
parents:
diff changeset
166