7
|
1 <tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1+galaxy0" profile="@PROFILE@">
|
0
|
2 <description>from input data</description>
|
4
|
3 <macros>
|
|
4 <import>macros.xml</import>
|
|
5 </macros>
|
0
|
6 <requirements>
|
8
|
7 <expand macro="biopython_requirement"/>
|
|
8 <expand macro="pyyaml_requirement"/>
|
0
|
9 </requirements>
|
|
10 <command detect_errors="exit_code"><![CDATA[
|
|
11 #import re
|
|
12 #set gzipped = 'false'
|
|
13 #set input_type = $input_type_cond.input_type
|
4
|
14
|
|
15 #if $input_type in ["single", "pair"]:
|
|
16 #set read1 = $input_type_cond.read1
|
0
|
17 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
|
4
|
18 ln -s '${read1}' '${read1_identifier}' &&
|
|
19 #if $input_type == "pair":
|
|
20 #set read2 = $input_type_cond.read2
|
|
21 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
|
|
22 ln -s '${read2}' '${read2_identifier}' &&
|
0
|
23 #else:
|
4
|
24 #set read2 = None
|
0
|
25 #end if
|
|
26 #else:
|
4
|
27 #set read1 = $input_type_cond.reads_collection['forward']
|
|
28 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
|
|
29 ln -s '${read1}' '${read1_identifier}' &&
|
|
30 #set read2 = $input_type_cond.reads_collection['reverse']
|
|
31 #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
|
|
32 ln -s '${read2}' '${read2_identifier}' &&
|
0
|
33 #end if
|
4
|
34
|
0
|
35 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
|
4
|
36 --read1 '${read1_identifier}'
|
|
37 #if $read2 is not None
|
|
38 --read2 '${read2_identifier}'
|
0
|
39 #end if
|
|
40 --output_dbkey '$output_dbkey'
|
|
41 --output_metrics '$output_metrics'
|
4
|
42 #if $read1.is_of_type('fastqsanger.gz'):
|
|
43 --gzipped
|
0
|
44 #end if
|
4
|
45 #set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
|
|
46 #for $i in $dnaprint_fields:
|
|
47 --dnaprint_fields '${i[0]}' '${i[2]}'
|
|
48 #end for
|
0
|
49 ]]></command>
|
|
50 <inputs>
|
|
51 <conditional name="input_type_cond">
|
|
52 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
|
|
53 <option value="single" selected="true">Single files</option>
|
4
|
54 <option value="paired">Paired reads</option>
|
|
55 <option value="pair">Paired reads in separate data sets</option>
|
0
|
56 </param>
|
|
57 <when value="single">
|
4
|
58 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
|
0
|
59 </when>
|
4
|
60 <when value="paired">
|
|
61 <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
|
|
62 </when>
|
|
63 <when value="pair">
|
|
64 <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
|
|
65 <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
|
0
|
66 </when>
|
|
67 </conditional>
|
|
68 </inputs>
|
|
69 <outputs>
|
4
|
70 <data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
|
|
71 <data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
|
0
|
72 </outputs>
|
|
73 <tests>
|
4
|
74 <!-- 1 single read -->
|
|
75 <test expect_num_outputs="2">
|
|
76 <param name="input_type" value="single"/>
|
1
|
77 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
|
0
|
78 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
|
|
79 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
|
|
80 </test>
|
4
|
81 <!-- 1 set of paired reads -->
|
|
82 <test expect_num_outputs="2">
|
|
83 <param name="input_type" value="pair"/>
|
|
84 <param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
|
|
85 <param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
|
|
86 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
|
|
87 <output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
|
|
88 </test>
|
|
89 <!-- A collection of paired reads -->
|
|
90 <test expect_num_outputs="2">
|
|
91 <param name="input_type" value="paired"/>
|
1
|
92 <param name="reads_collection">
|
|
93 <collection type="paired">
|
4
|
94 <element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
|
|
95 <element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
|
1
|
96 </collection>
|
|
97 </param>
|
4
|
98 <output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
|
|
99 <output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
|
1
|
100 </test>
|
0
|
101 </tests>
|
|
102 <help>
|
|
103 **What it does**
|
|
104
|
4
|
105 Accepts a single fastqsanger read, a set of paired reads, or a paired collection of reads (bacterial samples) and
|
|
106 inspects the data to discover the best reference genome for aligning the reads.
|
|
107
|
|
108 The information needed to discover the best reference is maintained by the USDA in this repository_. References are currently
|
|
109 limited to TB complex, paraTB, and Brucella, but information for additional references will be added. The information for each
|
|
110 reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print". These strings
|
|
111 are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
|
|
112
|
|
113 .. _repository: https://github.com/USDA-VS/vSNP_reference_options
|
|
114
|
0
|
115
|
4
|
116 This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
|
|
117 input sample data. During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
|
|
118 consisting of zeros and ones based on the counts (i.e., a DNA print). This string is then compared to the strings contained
|
|
119 in the in-memory dictionary of DNA prints to find a match.
|
|
120
|
|
121 The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
|
|
122 the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
|
|
123 to align the reads to the associated reference.
|
|
124
|
|
125 This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
|
|
126 about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
|
0
|
127
|
|
128 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
|
4
|
129 and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
|
0
|
130
|
|
131 * Mycobacterium bovis AF2122/97
|
|
132 * Brucella abortus bv. 1 str. 9-941
|
|
133 * Brucella abortus strain BER
|
|
134 * Brucella canis ATCC 23365
|
|
135 * Brucella ceti TE10759-12
|
|
136 * Brucella melitensis bv. 1 str. 16M
|
|
137 * Brucella melitensis bv. 3 str. Ether
|
|
138 * Brucella melitensis BwIM_SOM_36b
|
|
139 * Brucella melitensis ATCC 23457
|
|
140 * Brucella ovis ATCC 25840
|
|
141 * Brucella suis 1330
|
|
142 * Mycobacterium tuberculosis H37Rv
|
|
143 * Mycobacterium avium subsp. paratuberculosis strain Telford
|
|
144 * Mycobacterium avium subsp. paratuberculosis K-10
|
|
145 * Brucella suis ATCC 23445
|
|
146 * Brucella suis bv. 3 str. 686
|
|
147
|
|
148 **Required Options**
|
|
149
|
1
|
150 * **Choose the category of the files to be analyzed** - select "Single files", "Paired reads" (a paired collection) or "Paired reads in separate data sets", then select the appropriate history items (a single fastqsanger read, a paired collection of fastqsanger reads, or two separate fastqsanger read files) based on the selected option.
|
0
|
151 </help>
|
4
|
152 <expand macro="citations"/>
|
0
|
153 </tool>
|
|
154
|