diff vsnp_determine_ref_from_data.xml @ 0:ebc08e5ce646 draft

Uploaded
author greg
date Tue, 21 Apr 2020 10:08:28 -0400
parents
children bca267738b33
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.xml	Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,166 @@
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+    <description>from input data</description>
+    <requirements> <!-- Packages this tool depends on, each pinned to an exact version. -->
+        <requirement type="package" version="1.76">biopython</requirement>
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+## Symlink the selected reads under sanitized names, then run the reference-detection script on them.
+#import re
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+#set gzipped = 'false'
+#set input_type = $input_type_cond.input_type
+#set input_reads_dir = 'input_reads'
+#set output_dbkey_dir = 'output_dbkey'
+#set output_metrics_dir = 'output_metrics'
+## The two output directories are the ones scanned by discover_datasets for collection runs.
+mkdir -p $input_reads_dir $output_dbkey_dir $output_metrics_dir &&
+#if str($input_type) == "single":
+    #set read_type_cond = $input_type_cond.read_type_cond
+    #set read1 = $read_type_cond.read1
+    ## Identifiers keep only whitespace, word characters and hyphens; every other character becomes '_'.
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+    ln -s '${read1}' '${read1_identifier}' &&
+    #if str($read_type_cond.read_type) == "single":
+        #if $read1.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #else:
+        #set read2 = $read_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read2}' '${read2_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #end if
+#else:
+    #for $i in $input_type_cond.reads_collection:
+        ## NOTE(review): one gzipped element marks the whole collection as gzipped; confirm mixed collections are not expected.
+        #if $i.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+        #set filename = $i.file_name
+        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+        ## Quoted because a sanitized identifier may still contain whitespace.
+        ln -s '${filename}' '${input_reads_dir}/${identifier}' &&
+    #end for
+#end if
+python '$__tool_directory__/vsnp_determine_ref_from_data.py'
+#if str($input_type) == "single":
+    --read1 '${read1_identifier}'
+    #if str($read_type_cond.read_type) != "single":
+        --read2 '${read2_identifier}'
+    #end if
+    --output_dbkey '$output_dbkey'
+    --output_metrics '$output_metrics'
+#end if
+--gzipped $gzipped
+--processes $processes
+## Pass columns 0 and 2 of every vsnp_dnaprints data table row to the script.
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
+]]></command>
+    <inputs>
+        <conditional name="input_type_cond"> <!-- Top-level switch between individual datasets and a list collection. -->
+            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
+                <option value="single" selected="true">Single files</option>
+                <option value="collection">Collections of files</option>
+            </param>
+            <when value="single">
+                <conditional name="read_type_cond"> <!-- Paired (the default) asks for read1 and read2; single asks for read1 only. -->
+                    <param name="read_type" type="select" label="Choose the read type">
+                        <option value="paired" selected="true">Paired</option>
+                        <option value="single">Single</option>
+                    </param>
+                    <when value="paired">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
+                    </when>
+                    <when value="single">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="collection"> <!-- A list collection whose elements are fastqsanger or fastqsanger.gz datasets. -->
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
+            </when>
+        </conditional>
+        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/> <!-- Forwarded to the script through its processes option; range 1 to 20, default 8. -->
+    </inputs>
+    <outputs> <!-- Single-file runs produce two txt datasets; collection runs produce two list collections instead. -->
+        <data name="output_dbkey" format="txt"  label="${tool.name} (dbkey) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <data name="output_metrics" format="txt"  label="${tool.name} (metrics) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <collection name="output_dbkey_collection" type="list"> <!-- Elements are discovered from the output_dbkey directory created by the command. -->
+            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+        <collection name="output_metrics_collection" type="list"> <!-- Elements are discovered from the output_metrics directory created by the command. -->
+            <discover_datasets pattern="__name__" directory="output_metrics" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test> <!-- No input_type or read_type is given, so the defaults apply: single files with paired reads. -->
+            <!-- Need to figure out how to test installed data tables -->
+            <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/> <!-- NOTE(review): read2 uses read2.fastqsanger; confirm reads.fastqsanger is the intended read1 file. -->
+            <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
+best reference genome for aligning the reads.  This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
+perform this task.  While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
+the complete string a "DNA print".  All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
+manager** tool are then inspected to find a match for the compiled "DNA print" string.  These files are each associated
+with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
+**Map with BWA-MEM**) to align the reads to the associated reference.
+
+The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+used to compile the "DNA print" string.
+
+This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
+and discovering the primary species is critical.  DNA print matching is currently supported for the following genomes.
+
+ * Mycobacterium bovis AF2122/97
+ * Brucella abortus bv. 1 str. 9-941
+ * Brucella abortus strain BER
+ * Brucella canis ATCC 23365
+ * Brucella ceti TE10759-12
+ * Brucella melitensis bv. 1 str. 16M
+ * Brucella melitensis bv. 3 str. Ether
+ * Brucella melitensis BwIM_SOM_36b
+ * Brucella melitensis ATCC 23457
+ * Brucella ovis ATCC 25840
+ * Brucella suis 1330
+ * Mycobacterium tuberculosis H37Rv
+ * Mycobacterium avium subsp. paratuberculosis strain Telford
+ * Mycobacterium avium subsp. paratuberculosis K-10
+ * Brucella suis ATCC 23445
+ * Brucella suis bv. 3 str. 686
+
+**Required Options**
+
+ * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
+ * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
+    </help>
+    <citations> <!-- Placeholder BibTeX entry: key, journal and year are the literal text None; the title reads "Manuscript in preparation". -->
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {1. Stuber T},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/USDA-VS/vSNP},}
+        </citation>
+    </citations>
+</tool>
+