comparison vsnp_determine_ref_from_data.xml @ 0:ebc08e5ce646 draft

Uploaded
author greg
date Tue, 21 Apr 2020 10:08:28 -0400
parents
children bca267738b33
comparison
equal deleted inserted replaced
-1:000000000000 0:ebc08e5ce646
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
2 <description>from input data</description>
3 <requirements>
<!-- Package dependencies declared for this tool, each pinned to an exact version. -->
4 <requirement type="package" version="1.76">biopython</requirement>
5 <requirement type="package" version="5.3">pyyaml</requirement>
6 </requirements>
7 <command detect_errors="exit_code"><![CDATA[
## Stages the selected inputs as symlinks under sanitized names, creates the
## working directories, then runs vsnp_determine_ref_from_data.py on them.
## NOTE(review): os is imported but does not appear to be used in this template.
8 #import os
9 #import re
## Rows of the vsnp_dnaprints data table; columns 0 and 2 of every row are
## forwarded to the script at the end of the command line.
10 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
## Switched to 'true' below when the inputs are fastqsanger.gz.
11 #set gzipped = 'false'
12 #set input_type = $input_type_cond.input_type
13 #set input_reads_dir = 'input_reads'
14 #set output_dbkey_dir = 'output_dbkey'
15 #set output_metrics_dir = 'output_metrics'
16 mkdir -p $input_reads_dir &&
17 mkdir -p $output_dbkey_dir &&
18 mkdir -p $output_metrics_dir &&
19 #if str($input_type) == "single":
20 #set read_type_cond = $input_type_cond.read_type_cond
21 #set read1 = $read_type_cond.read1
## Every character other than whitespace, word characters and '-' becomes '_'.
22 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
23 #if str($read_type_cond.read_type) == "single":
24 ln -s '${read1}' '${read1_identifier}' &&
25 #if $read1.is_of_type('fastqsanger.gz'):
26 #set gzipped = 'true'
27 #end if
28 #else:
29 #set read2 = $read_type_cond.read2
30 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
31 ln -s '${read1}' '${read1_identifier}' &&
32 ln -s '${read2}' '${read2_identifier}' &&
## Paired reads count as gzipped only when both files are fastqsanger.gz.
33 #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
34 #set gzipped = 'true'
35 #end if
36 #end if
37 #else:
## Collection mode: any gzipped element sets the flag for the whole run.
38 #for $i in $input_type_cond.reads_collection:
39 #if $i.is_of_type('fastqsanger.gz'):
40 #set gzipped = 'true'
41 #end if
42 #set filename = $i.file_name
43 #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
## Both paths are quoted because the sanitized identifier may still contain
## whitespace; the substitution above has already replaced any single quote
## with '_', so single-quoting cannot be broken by the identifier.
44 ln -s '${filename}' '${input_reads_dir}/${identifier}' &&
45 #end for
46 #end if
47 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
48 #if str($input_type) == "single":
49 #if str($read_type_cond.read_type) == "single":
50 --read1 '${read1_identifier}'
51 #else:
52 --read1 '${read1_identifier}'
53 --read2 '${read2_identifier}'
54 #end if
## Explicit output paths are passed in single mode only; collection mode
## passes neither read nor output arguments.
55 --output_dbkey '$output_dbkey'
56 --output_metrics '$output_metrics'
57 #end if
58 --gzipped $gzipped
59 --processes $processes
## One --dnaprint_fields pair per data table row.
60 #for $i in $dnaprint_fields:
61 --dnaprint_fields '${i[0]}' '${i[2]}'
62 #end for
63 ]]></command>
64 <inputs>
<!-- Outer switch between individual datasets ("single", the default) and a list collection ("collection"). -->
65 <conditional name="input_type_cond">
66 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
67 <option value="single" selected="true">Single files</option>
68 <option value="collection">Collections of files</option>
69 </param>
70 <when value="single">
<!-- Inner switch between paired reads (the default) and a single read; read2 exists only in the paired branch. -->
71 <conditional name="read_type_cond">
72 <param name="read_type" type="select" label="Choose the read type">
73 <option value="paired" selected="true">Paired</option>
74 <option value="single">Single</option>
75 </param>
76 <when value="paired">
77 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
78 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
79 </when>
80 <when value="single">
81 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
82 </when>
83 </conditional>
84 </when>
85 <when value="collection">
<!-- A flat list collection whose elements are fastqsanger or fastqsanger.gz datasets. -->
86 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
87 </when>
88 </conditional>
<!-- Integer between 1 and 20 (default 8); the command template forwards it as the processes argument. -->
89 <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
90 </inputs>
91 <outputs>
<!-- Two plain datasets are produced when input_type is "single"; two list collections are produced when it is "collection". -->
92 <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}">
93 <filter>input_type_cond['input_type'] == 'single'</filter>
94 </data>
95 <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
96 <filter>input_type_cond['input_type'] == 'single'</filter>
97 </data>
<!-- Collection elements are discovered from the output_dbkey directory that the command creates; __name__ is Galaxy's built-in pattern that names each element after its file. -->
98 <collection name="output_dbkey_collection" type="list">
99 <discover_datasets pattern="__name__" directory="output_dbkey" format="txt" />
100 <filter>input_type_cond['input_type'] == 'collection'</filter>
101 </collection>
<!-- Same discovery rule, applied to the output_metrics directory. -->
102 <collection name="output_metrics_collection" type="list">
103 <discover_datasets pattern="__name__" directory="output_metrics" format="txt" />
104 <filter>input_type_cond['input_type'] == 'collection'</filter>
105 </collection>
106 </outputs>
107 <tests>
<!-- Single test case. Neither input_type nor read_type is set, so the selected defaults apply (input_type "single", read_type "paired"). -->
108 <test>
109 <!-- Need to figure out how to test installed data tables -->
<!-- NOTE(review): read1 uses reads.fastqsanger while read2 uses read2.fastqsanger; confirm the read1 file name against the test data. -->
110 <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/>
111 <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
112 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
113 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
114 </test>
115 </tests>
116 <help>
117 **What it does**
118
119 Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
120 best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
121 perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
122 the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
123 manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
124 with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
125 **Map with BWA-MEM**) to align the reads to the associated reference.
126
127 The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
128 used to compile the "DNA print" string.
129
130 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
131 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
132
133 * Mycobacterium bovis AF2122/97
134 * Brucella abortus bv. 1 str. 9-941
135 * Brucella abortus strain BER
136 * Brucella canis ATCC 23365
137 * Brucella ceti TE10759-12
138 * Brucella melitensis bv. 1 str. 16M
139 * Brucella melitensis bv. 3 str. Ether
140 * Brucella melitensis BwIM_SOM_36b
141 * Brucella melitensis ATCC 23457
142 * Brucella ovis ATCC 25840
143 * Brucella suis 1330
144 * Mycobacterium tuberculosis H37Rv
145 * Mycobacterium avium subsp. paratuberculosis strain Telford
146 * Mycobacterium avium subsp. paratuberculosis K-10
147 * Brucella suis ATCC 23445
148 * Brucella suis bv. 3 str. 686
149
150 **Required Options**
151
152 * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
153 * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
154 </help>
155 <citations>
<!-- Placeholder BibTeX entry: the key, journal and year are the literal string None and the title reads "Manuscript in preparation"; only the url field carries a concrete reference. -->
156 <citation type="bibtex">
157 @misc{None,
158 journal = {None},
159 author = {1. Stuber T},
160 title = {Manuscript in preparation},
161 year = {None},
162 url = {https://github.com/USDA-VS/vSNP},}
163 </citation>
164 </citations>
165 </tool>
166