Mercurial > repos > greg > vsnp_determine_ref_from_data
diff vsnp_determine_ref_from_data.xml @ 0:ebc08e5ce646 draft
Uploaded
author | greg |
---|---|
date | Tue, 21 Apr 2020 10:08:28 -0400 |
parents | |
children | bca267738b33 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vsnp_determine_ref_from_data.xml Tue Apr 21 10:08:28 2020 -0400
@@ -0,0 +1,172 @@
+<tool id="vsnp_determine_ref_from_data" name="vSNP: determine reference" version="1.0.0">
+    <description>from input data</description>
+    <requirements>
+        <requirement type="package" version="1.76">biopython</requirement>
+        <requirement type="package" version="5.3">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+#import os
+#import re
+## Rows of the 'vsnp_dnaprints' tool data table; columns 0 and 2 of each row are passed to the script via --dnaprint_fields.
+#set $dnaprint_fields = $__app__.tool_data_tables['vsnp_dnaprints'].get_fields()
+## Starts as 'false' and is switched to 'true' below depending on the datatype of the selected reads.
+#set gzipped = 'false'
+#set input_type = $input_type_cond.input_type
+#set input_reads_dir = 'input_reads'
+#set output_dbkey_dir = 'output_dbkey'
+#set output_metrics_dir = 'output_metrics'
+mkdir -p $input_reads_dir &&
+mkdir -p $output_dbkey_dir &&
+mkdir -p $output_metrics_dir &&
+## Element identifiers are sanitized before use as link names: every character that is not whitespace, a word character or '-' becomes '_'.
+#if str($input_type) == "single":
+    #set read_type_cond = $input_type_cond.read_type_cond
+    #set read1 = $read_type_cond.read1
+    #set read1_identifier = re.sub('[^\s\w\-]', '_', str($read1.element_identifier))
+    #if str($read_type_cond.read_type) == "single":
+        ln -s '${read1}' '${read1_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #else:
+        #set read2 = $read_type_cond.read2
+        #set read2_identifier = re.sub('[^\s\w\-]', '_', str($read2.element_identifier))
+        ln -s '${read1}' '${read1_identifier}' &&
+        ln -s '${read2}' '${read2_identifier}' &&
+        #if $read1.is_of_type('fastqsanger.gz') and $read2.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+    #end if
+#else:
+    #for $i in $input_type_cond.reads_collection:
+        #if $i.is_of_type('fastqsanger.gz'):
+            #set gzipped = 'true'
+        #end if
+        #set filename = $i.file_name
+        #set identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier))
+        ## Both paths are quoted because the sanitized identifier may still contain whitespace.
+        ln -s '${filename}' '${input_reads_dir}/${identifier}' &&
+    #end for
+#end if
+python '$__tool_directory__/vsnp_determine_ref_from_data.py'
+## Output paths are passed only for single-file inputs; for collections the outputs are gathered by the
+## discover_datasets rules on the output_dbkey and output_metrics directories.
+#if str($input_type) == "single":
+    #if str($read_type_cond.read_type) == "single":
+        --read1 '${read1_identifier}'
+    #else:
+        --read1 '${read1_identifier}'
+        --read2 '${read2_identifier}'
+    #end if
+    --output_dbkey '$output_dbkey'
+    --output_metrics '$output_metrics'
+#end if
+--gzipped $gzipped
+--processes $processes
+#for $i in $dnaprint_fields:
+    --dnaprint_fields '${i[0]}' '${i[2]}'
+#end for
+]]></command>
+    <inputs>
+        <conditional name="input_type_cond">
+            <param name="input_type" type="select" label="Choose the category of the files to be analyzed">
+                <option value="single" selected="true">Single files</option>
+                <option value="collection">Collections of files</option>
+            </param>
+            <when value="single">
+                <conditional name="read_type_cond">
+                    <param name="read_type" type="select" label="Choose the read type">
+                        <option value="paired" selected="true">Paired</option>
+                        <option value="single">Single</option>
+                    </param>
+                    <when value="paired">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                        <param name="read2" type="data" format="fastqsanger.gz,fastqsanger" label="Read2 fastq file"/>
+                    </when>
+                    <when value="single">
+                        <param name="read1" type="data" format="fastqsanger.gz,fastqsanger" label="Read1 fastq file"/>
+                    </when>
+                </conditional>
+            </when>
+            <when value="collection">
+                <param name="reads_collection" type="data_collection" format="fastqsanger,fastqsanger.gz" collection_type="list" label="Collection of fastqsanger files"/>
+            </when>
+        </conditional>
+        <param name="processes" type="integer" min="1" max="20" value="8" label="Number of processes for job splitting"/>
+    </inputs>
+    <outputs>
+        <data name="output_dbkey" format="txt" label="${tool.name} (dbkey) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <data name="output_metrics" format="txt" label="${tool.name} (metrics) on ${on_string}">
+            <filter>input_type_cond['input_type'] == 'single'</filter>
+        </data>
+        <collection name="output_dbkey_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_dbkey" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+        <collection name="output_metrics_collection" type="list">
+            <discover_datasets pattern="__name__" directory="output_metrics" format="txt" />
+            <filter>input_type_cond['input_type'] == 'collection'</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <test>
+            <!-- Need to figure out how to test installed data tables -->
+            <param name="read1" value="reads.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <param name="read2" value="read2.fastqsanger" ftype="fastqsanger" dbkey="89"/>
+            <output name="output_dbkey" file="output_dbkey.txt" ftype="txt"/>
+            <output name="output_metrics" file="output_metrics.txt" ftype="txt"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+Accepts a single fastqsanger read, a set of paired reads, or a collection of reads and inspects the data to discover the
+best reference genome for aligning the reads. This tool is, in essence, a DNA sniffer, and is the first Galaxy tool to
+perform this task. While inspecting the data, a string of 0's and 1's is compiled based on the data contents, and we call
+the complete string a "DNA print". All of the "DNA prints" files installed by the complementary **vSNP DNAprints data
+manager** tool are then inspected to find a match for the compiled "DNA print" string. These files are each associated
+with a Galaxy "dbkey" (i.e., genome build), so when a match is found, the associated "dbkey" is passed to a mapper (e.g.,
+**Map with BWA-MEM**) to align the reads to the associated reference.
+
+The tool produces 2 text files, a "dbkey" file that contains the dbkey string and a "metrics" file that provides information
+used to compile the "DNA print" string.
+
+This tool is important for samples containing bacterial species because many of the samples have a "mixed bag" of species,
+and discovering the primary species is critical. DNA print matching is currently supported for the following genomes.
+
+ * Mycobacterium bovis AF2122/97
+ * Brucella abortus bv. 1 str. 9-941
+ * Brucella abortus strain BER
+ * Brucella canis ATCC 23365
+ * Brucella ceti TE10759-12
+ * Brucella melitensis bv. 1 str. 16M
+ * Brucella melitensis bv. 3 str. Ether
+ * Brucella melitensis BwIM_SOM_36b
+ * Brucella melitensis ATCC 23457
+ * Brucella ovis ATCC 25840
+ * Brucella suis 1330
+ * Mycobacterium tuberculosis H37Rv
+ * Mycobacterium avium subsp. paratuberculosis strain Telford
+ * Mycobacterium avium subsp. paratuberculosis K-10
+ * Brucella suis ATCC 23445
+ * Brucella suis bv. 3 str. 686
+
+**Required Options**
+
+ * **Choose the category of the files to be analyzed** - select "Single files" or "Collections of files", then select the appropriate history items (single or paired fastqsanger reads or collections of fastqsanger reads) based on the selected option.
+ * **Number of processes for job splitting** - Select the number of processes for splitting the job to shorten execution time.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {1. Stuber T},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/USDA-VS/vSNP},}
+        </citation>
+    </citations>
+</tool>
+