vsnp_determine_ref_from_data: vsnp_determine_ref_from

comparison vsnp_determine_ref_from_data.xml @ 4:36bdf8b439ed draft

Uploaded

author	greg
date	Sun, 03 Jan 2021 16:13:22 +0000
parents	6116deacb2c7
children	d5e66f9fe086

comparison

equal deleted inserted replaced

-:6116deacb2c7
+:36bdf8b439ed
-<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="@WRAPPER_VERSION@.1" profile="@PROFILE@">
 <description>from input data</description>
+<macros>
+<import>macros.xml</import>
+</macros>
 <requirements>
 <requirement type="package" version="1.76">biopython</requirement>
 <requirement type="package" version="5.3">pyyaml</requirement>
 </requirements>
 <command detect_errors="exit_code"><![CDATA[
-#import os
 #import re
 #set gzipped = 'false'
 #set input_type = $input_type_cond.input_type
-#set input_reads_dir = 'input_reads'
-#set output_dbkey_dir = 'output_dbkey'
+#if $input_type in ["single", "pair"]:
-#set output_metrics_dir = 'output_metrics'
+#set read1 = $input_type_cond.read1
-mkdir -p $input_reads_dir &&
-mkdir -p $output_dbkey_dir &&
-mkdir -p $output_metrics_dir &&
-#if str($input_type) == "single":
-#set read_type_cond = $input_type_cond.read_type_cond
-#set read1 = $read_type_cond.read1
 #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
-#if str($read_type_cond.read_type) == "single":
+ln -s '${read1}' '${read1_identifier}' &&
-ln -s '${read1}' '${read1_identifier}' &&
+#if $input_type == "pair":
-#if $read1.is_of_type('fastqsanger.gz'):
+#set read2 = $input_type_cond.read2
-#set gzipped = 'true'
+#set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
-#end if
+ln -s '${read2}' '${read2_identifier}' &&
 #else:
-#set read2 = $read_type_cond.read2
+#set read2 = None
-#set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
-ln -s '${read1}' '${read1_identifier}' &&
-ln -s '${read2}' '${read2_identifier}' &&
-#if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
-#set gzipped = 'true'
-#end if
 #end if
 #else:
-#set collection_type = $input_type_cond.collection_type_cond.collection_type
+#set read1 = $input_type_cond.reads_collection['forward']
-#for $i in $input_type_cond.collection_type_cond.reads_collection:
+#set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.name))
-#if $i.is_of_type('fastqsanger.gz'):
+ln -s '${read1}' '${read1_identifier}' &&
-#set gzipped = 'true'
+#set read2 = $input_type_cond.reads_collection['reverse']
-#end if
+#set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.name))
-#set filename = $i.file_name
+ln -s '${read2}' '${read2_identifier}' &&
-#if str($collection_type) == 'single_reads':
-#set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
-#else:
-## Galaxy builds lists of pairs as nested lists with elements
-## named forward and reverse.  When flattened, these lists
-## will work as inputs to the Parse parameter value expression
-## tool in workflows.  However, the output list created by the
-## expression tool will not function correctly with the bwa_mem
-## mapper.  Naming the identifier as follows is a solution.
-#set identifier = re.sub('[^\s\w\-]', '_', str($i.name))
-#end if
-ln -s '$filename' '$input_reads_dir/$identifier' &&
-#end for
 #end if
 python '$__tool_directory__/vsnp_determine_ref_from_data.py'
-#if str($input_type) == "single":
+--read1 '${read1_identifier}'
-#if str($read_type_cond.read_type) == "single":
+#if $read2 is not None
---read1 '${read1_identifier}'
+--read2 '${read2_identifier}'
-#else:
---read1 '${read1_identifier}'
---read2 '${read2_identifier}'
 #end if
 --output_dbkey '$output_dbkey'
 --output_metrics '$output_metrics'
+#if $read1.is_of_type('fastqsanger.gz'):
+--gzipped
 #end if
---gzipped $gzipped
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
---processes $processes
+#for $i in $dnaprint_fields:
-#if str($in_test_mode) == "false":
+--dnaprint_fields '${i[0]}' '${i[2]}'
-#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#end for
-#for $i in $dnaprint_fields:
---dnaprint_fields '${i[0]}' '${i[2]}'
-#end for
-#else:
---in_test_mode '$in_test_mode'
-#end if
 ]]></command>
 <inputs>
 <conditional name="input_type_cond">
 <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
 <option value="single" selected="true">Single files</option>
-<option value="collection">Collection of files</option>
+		<option value="paired">Paired reads</option>
+		<option value="pair">Paired reads in separate data sets</option>
 </param>
 <when value="single">
-<conditional name="read_type_cond">
+<param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-<param name="read_type" type="select" label="Choose the read type">
-<option value="paired" selected="true">Paired</option>
-<option value="single">Single</option>
-</param>
-<when value="paired">
-<param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-<param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
-</when>
-<when value="single">
-<param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-</when>
-</conditional>
 </when>
-<when value="collection">
+<when value="paired">
-<conditional name="collection_type_cond">
+<param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
-<param name="collection_type" type="select" label="Collection of single reads or paired reads?">
+</when>
-<option value="single_reads" selected="true">Single reads</option>
+<when value="pair">
-<option value="paired_reads">Paired reads</option>
+<param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
-</param>
+<param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
-<when value="single_reads">
-<param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
-</when>
-<when value="paired_reads">
-<param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="paired" label="Collection of fastqsanger paired read files"/>
-</when>
-</conditional>
 </when>
 </conditional>
-<param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
-<!-- Functional testing -->
-<param name="in_test_mode" type="hidden" value="false"/>
 </inputs>
 <outputs>
-<data name="output_dbkey" format="txt"  label="${tool.name} (dbkey) on ${on_string}">
+<data name="output_dbkey" format="txt" label="${tool.name} on ${on_string} (dbkey)"/>
-<filter>input_type_cond['input_type'] == 'single'</filter>
+<data name="output_metrics" format="txt" label="${tool.name} on ${on_string} (metrics)"/>
-</data>
-<data name="output_metrics" format="txt"  label="${tool.name} (metrics) on ${on_string}">
-<filter>input_type_cond['input_type'] == 'single'</filter>
-</data>
-<collection name="output_dbkey_collection" type="list" label="${tool.name} (dbkey) on ${on_string}">
-<discover_datasets pattern="__name__" directory="output_dbkey" format="txt"/>
-<filter>input_type_cond['input_type'] == 'collection'</filter>
-</collection>
-<collection name="output_metrics_collection" type="list" label="${tool.name} (metrics) on ${on_string}">
-<discover_datasets pattern="__name__" directory="output_metrics" format="txt"/>
-<filter>input_type_cond['input_type'] == 'collection'</filter>
-</collection>
 </outputs>
 <tests>
-<test>
+<!-- 1 single read -->
-<param name="in_test_mode" value="true"/>
+<test expect_num_outputs="2">
-<param name="read_type" value="single"/>
+<param name="input_type" value="single"/>
 <param name="read1" value="Mcap_Deer_DE_SRR650221.fastq.gz" ftype="fastqsanger.gz"/>
 <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
 <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
 </test>
-<test>
+<!-- 1 set of paired reads -->
-<param name="in_test_mode" value="true"/>
+<test expect_num_outputs="2">
-<param name="input_type" value="collection"/>
+<param name="input_type" value="pair"/>
-<param name="collection_type" value="paired_reads"/>
+<param name="read1" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
+<param name="read2" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
+<output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
+<output name="output_metrics" file="paired_metrics.txt" ftype="txt"/>
+</test>
+<!-- A collection of paired reads -->
+<test expect_num_outputs="2">
+<param name="input_type" value="paired"/>
 <param name="reads_collection">
 <collection type="paired">
-<element name="forward" value="forward.fastq.gz" ftype="fastqsanger.gz"/>
+<element name="forward" value="CMC_20E1_R1.fastq.gz" ftype="fastqsanger.gz"/>
-<element name="reverse" value="reverse.fastq.gz" ftype="fastqsanger.gz"/>
+<element name="reverse" value="CMC_20E1_R2.fastq.gz" ftype="fastqsanger.gz"/>
 </collection>
 </param>
-<output_collection name="output_dbkey_collection" type="list">
+<output name="output_dbkey" file="paired_dbkey.txt" ftype="txt"/>
-<element name="forward.txt" file="forward_dbkey.txt" ftype="txt"/>
+<output name="output_metrics" file="paired_collection_metrics.txt" ftype="txt"/>
-<element name="reverse.txt" file="reverse_dbkey.txt" ftype="txt"/>
-</output_collection>
-<output_collection name="output_metrics_collection" type="list">
-<element name="forward.txt" file="forward_metrics.txt" ftype="txt"/>
-<element name="reverse.txt" file="reverse_metrics.txt" ftype="txt"/>
-</output_collection>
 </test>
 </tests>
 <help>
 **What it does**
-Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
+Accepts a single fastqsanger read, a set of paired reads, or a paired collection of reads (bacterial samples) and
-best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
+inspects the data to discover the best reference genome for aligning the reads.
-perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
-the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
-manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
-with a Galaxy "dbkey" (i.e., genome build), so when a metach is found, the associated "dbkey" is passed to a mapper (e.g.,
-**Map with BWA-MEM**) to align the reads to the associated reference.
-The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+The information needed to discover the best reference is maintained by the USDA in this repository_.  References are currently
-used to compile the "DNA print" string.
+.. _repository:  https://github.com/USDA-VS/vSNP_reference_options
+limited to TB complex, paraTB, and Brucella, but information for additional references will be added.  The information for each
+reference is a string consisting of zeros and ones, compiled by USDA researchers, which we call a "DNA print".   These strings
+are maintained in yaml files for use in Galaxy, and are installed via the **vSNP DNAprints data manager** tool.
+This tool creates an in-memory dictionary of these DNA print strings for matching with a string generated by inspecting the
+input sample data.  During inspection, this tool accrues sequence counts for supported species, ultimately generating a string
+consisting of zeros and ones based on the counts (i.e., a DNA print).  This string is then compared to the strings contained
+in the in-memory dictionary of DNA prints to find a match.
+The strings in the in-memory dictionary are each associated with a Galaxy "dbkey" (i.e., genome build), so when a match is found,
+the associated "dbkey" is passed to a mapper (e.g., **Map with BWA-MEM**), typically within a workflow via an expression tool,
+to align the reads to the associated reference.
+This tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+about the sequence counts that were discovered in the input sample data that produced the "DNA print" string.
 This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
-and discovering the primary species is critical.  DNA print matchig is currently supported for the following genomes.
+and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.
 * Mycobacterium bovis AF2122/97
 * Brucella abortus bv. 1 str. 9-941
 * Brucella abortus strain BER
 * Brucella canis ATCC 23365
 * Brucella suis bv. 3 str. 686
 **Required Options**
 * **Choose the category of the files to be analyzed** - select "Single files", "Paired reads" or "Paired reads in separate data sets", then select the appropriate history items (a single fastqsanger read, a paired collection of fastqsanger reads, or two separate fastqsanger read files) based on the selected option.
-* **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
 </help>
-<citations>
+<expand macro="citations"/>
-<citation type="bibtex">
-@misc{None,
-journal = {None},
-author = {1. Stuber T},
-title = {Manuscript in preparation},
-year = {None},
-url = {https://github.com/USDA-VS/vSNP},}
-</citation>
-</citations>
 </tool>

Mercurial > repos > greg > vsnp_determine_ref_from_data

comparison vsnp_determine_ref_from_data.xml @ 4:36bdf8b439ed draft