diff vapor.xml @ 4:244812f5bd1f draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vapor commit e0546646548259ccc4cfd65dbb793fc910fea583
author iuc
date Wed, 16 Nov 2022 13:41:01 +0000
parents f11d2dd29b2b
children
line wrap: on
line diff
--- a/vapor.xml	Mon Oct 17 07:28:50 2022 +0000
+++ b/vapor.xml	Wed Nov 16 13:41:01 2022 +0000
@@ -1,6 +1,6 @@
-<tool id="vapor" name="VAPOR" version="@TOOL_VERSION@+galaxy2" profile="21.05">
+<tool id="vapor" name="VAPOR" version="@TOOL_VERSION@+galaxy3" profile="21.05">
     <description>
-        Classify Influenza samples from raw short read sequence data
+        Classify Influenza samples from short reads sequence data
     </description>
     <macros>
         <token name="@TOOL_VERSION@">1.0.2</token>
@@ -10,8 +10,22 @@
     </xrefs>
     <requirements>
         <requirement type="package" version="@TOOL_VERSION@">vapor</requirement>
+        <!-- gawk only required for circumventing current bug in command line
+        tool => remove once fixed (see command section below) -->
+        <requirement type="package" version="5.1.0">gawk</requirement>
     </requirements>
     <command detect_errors="exit_code"><![CDATA[
+        #set $total_refs = int($fasta_file.metadata.sequences)
+
+        ## The next two lines for on the fly uppercasing are a workaround for a bug
+        ## in vapor 1.0.2, which caused sequence comparisons to be case-sensitive.
+        ## Got fixed upstream in:
+        ## https://github.com/connor-lab/vapor/commit/b5ec5857cbf53ed45ca7487dac2b4b85ecfe33ea
+        ## but unfortunately no release has been tagged since.
+        ## Remove with next release!!
+        awk '{ if ($0 !~ />/) {print toupper($0)} else {print $0} }' '$fasta_file' > ref_upper.fa &&
+        #set $fasta_file = 'ref_upper.fa'
+
         #if str($fastq_input.fastq_input_selector) == "paired"
             #set r1_ext = $fastq_input.fastq1.extension
             #set r2_ext = $fastq_input.fastq2.extension
@@ -27,54 +41,56 @@
             ln -s '$fastq_input.fastq_single' fastq1.$r1_ext &&
         #end if
         vapor.py
-            --return_best_n $opt.return_best_n
+            #if int($return_best_n) > 0
+                --return_best_n $return_best_n
+            #else
+                --return_best_n $total_refs
+            #end if
             #if $output_type == "fasta"
                 --return_seqs
             #end if
-            -k '$opt.kmer_length'
-            -t '$opt.score_threshold'
-            -c '$opt.min_kmer_cov'
-            -m '$opt.min_kmer_prop'
+            -k $opt.kmer_length
+            -t $opt.threshold
+            -c $opt.min_kmer_cov
+            -m $opt.min_kmer_prop
             -fa '$fasta_file'
-            -fq 
-            fastq1.$r1_ext
+            -fq fastq1.$r1_ext
             #if str($fastq_input.fastq_input_selector) in ["paired", "paired_collection"]
                 fastq2.$r2_ext
             #end if
-            -f '$opt.top_seed_frac'
-            -q
+            -f $opt.top_seed_frac
         > out_file
-    ]]>    </command>
+    ]]></command>
     <inputs>
-        <param name="fasta_file" format="fasta" type="data" label="FASTA file" help="Raw short read sequences (full length reference segment sequences)" />
+        <param name="fasta_file" format="fasta" type="data" label="Reference sequences" help="Select a multisample fasta dataset with reference sequences to base classification on." />
         <conditional name="fastq_input">
-            <param name="fastq_input_selector" type="select" label="Single or Paired-end reads" help="Select between paired and single end data">
-                <option value="single">Single</option>
-                <option value="paired">Paired</option>
-                <option value="paired_collection">Paired Collection</option>
+            <param name="fastq_input_selector" type="select" label="Type of sequencing data">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+                <option value="paired_collection">Paired-end as collection</option>
             </param>
             <when value="single">
-                <param name="fastq_single" format="fastqsanger,fastqsanger.gz" type="data" label="FASTQ file" help="Raw short read sequences (full length reference segment sequences)" />
+                <param name="fastq_single" format="fastqsanger,fastqsanger.gz" type="data" label="Sequenced reads" help="Specify the sequenced reads dataset." />
             </when>
             <when value="paired">
-                <param name="fastq1" type="data" format="fastqsanger,fastqsanger.gz" label="Select first set of reads" help="Specify reads dataset with forward reads"/>
-                <param name="fastq2" type="data" format="fastqsanger,fastqsanger.gz" label="Select second set of reads" help="Specify reads dataset with reverse reads"/>
+                <param name="fastq1" type="data" format="fastqsanger,fastqsanger.gz" label="Forward reads" help="Specify the sequenced reads dataset with forward reads."/>
+                <param name="fastq2" type="data" format="fastqsanger,fastqsanger.gz" label="Reverse reads" help="Specify the sequenced reads dataset with reverse reads."/>
             </when>
             <when value="paired_collection">
-                <param name="fastq_pairs" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Select a paired collection" help="Dataset collection with forward and reverse reads"/>
+                <param name="fastq_pairs" format="fastqsanger,fastqsanger.gz" type="data_collection" collection_type="paired" label="Paired collection of sequenced reads" help="Select a collection with forward and reverse reads."/>
             </when>
         </conditional>
-        <param name="output_type" type="select" label="Output type">
-            <option value="scores" selected="true">Return scores only</option>
-            <option value="fasta">Return FASTA only</option>
+        <param name="output_type" type="select" label="Desired output">
+            <option value="scores" selected="true">Return scores of best matches</option>
+            <option value="fasta">Return FASTA sequences of best matches</option>
         </param>
+        <param name="return_best_n" type="integer" min="0" value="1" label="Limit number of reported matches to" help="Determines the maximum number of candidate matches sorted by score that will be reported. Set to zero to get all candidate matches reported." />
         <section name="opt" title="Optional arguments" expanded="true">
-            <param name="return_best_n" type="integer" min="1" value="1" label="Returns the highest scoring n queries" help="A list of the best n queries instead of only the highest scoring query" />
-            <param name="kmer_length" type="integer" min="5" max="30" value="21" label="Kmer Length" help="" />
-            <param name="score_threshold" type="float" min="0.0" max="1.0" value="0.2" label="Read kmer filtering threshold" help="" />
-            <param name="min_kmer_cov" type="integer" value="5" label="Min coverage kmer culling" help="Minimum coverage kmer culling" />
-            <param name="min_kmer_prop" type="float" value="0.1" label="Min kmer proportion" help="Minimum proportion of matched kmers allowed for queries" />
-            <param name="top_seed_frac" type="float" min="0.0" max="1.0" value="0.2" label="Fraction of best seeds to extend" help="" />
+            <param argument="-k" name="kmer_length" type="integer" min="5" max="30" value="21" label="Kmer Length" help="Generate k-mers of this length from the reference sequences and the sequenced reads. Note: smaller k-mer sizes come at the cost of decreased specificity and k-mer sizes below 21 have an increased risk of contaminating sequences getting analyzed. Only decrease the default (21) if you know your sample is pure (i.e., sequenced reads represent viral reads only), or if you have increased --threshold sufficiently." />
+            <param argument="--threshold" type="float" min="0" max="1" value="0.2" label="Read kmer filtering threshold" help="Sequenced reads that don't have at least this proportion of their k-mers matching k-mers generated from the reference sequences will not be considered in the analysis; default: 0.2" />
+            <param argument="--min_kmer_cov" type="integer" min="1" value="5" label="Coverage threshold for k-mer culling" help="Minimum coverage by sequenced reads for a reference k-mer to be kept during culling; default: 5" />
+            <param argument="--min_kmer_prop" type="float" min="0" max="1" value="0.1" label="Minimum k-mer proportion" help="Minimum proportion of matching kmers required for queries; default: 0.1" />
+            <param argument="--top_seed_frac" type="float" min="0" max="1" value="1" label="Fraction of best seeds to extend" help="Of the queries still considered after applying the --min_kmer_prop filter above, the tool will calculate and report scores only for this fraction. Lowering this value leads to shorter runtime because fewer scores will have to be calculated, but also to fewer results getting reported. CAVEAT: this version of the tool will round down the result of applying this parameter so it is possible to end up with zero queries to be considered further. Change only if runtime is an issue!" />
         </section>
     </inputs>
     <outputs>
@@ -99,6 +115,15 @@
         </test>
         <test expect_num_outputs="1">
             <conditional name="fastq_input">
+                <param name="fastq_input_selector" value="single" />
+                <param name="fastq_single" ftype="fastq" value="test_reads.fq" />
+            </conditional>
+            <param name="fasta_file" value="HA_sample.fa" />
+            <param name="return_best_n" value="0" />
+            <output name="output_scores" file="output1_full.tab" />
+        </test>
+        <test expect_num_outputs="1">
+            <conditional name="fastq_input">
                 <param name="fastq_input_selector" value="paired" />
                 <param name="fastq1" ftype="fastq" value="test_reads.fq" />
                 <param name="fastq2" ftype="fastq" value="test_reads2.fq" />
@@ -122,7 +147,7 @@
             <param name="fasta_file" value="HA_sample.fa" />
             <section name="opt">
                 <param name="kmer_length" value="29" />
-                <param name="score_threshold" value="0.5" />
+                <param name="threshold" value="0.5" />
                 <param name="min_kmer_cov" value="7" />
                 <param name="min_kmer_prop" value="0.5" />
                 <param name="top_seed_frac" value="0.5" />
@@ -136,9 +161,7 @@
             </conditional>
             <param name="fasta_file" value="HA_sample.fa" />
             <param name="output_type" value="fasta" />
-            <section name="opt">
-                <param name="return_best_n" value="3" />
-            </section>
+            <param name="return_best_n" value="3" />
             <output name="output_fasta" file="output5.fa" />
         </test>
     </tests>
@@ -146,11 +169,11 @@
 **What it does**
 
 VAPOR is a tool for classification of Influenza samples from raw short read sequence data for downstream bioinformatics analysis.
-VAPOR is provided with a fasta file of full-length sequences (> 20,000) for a given segment, a set of reads, and attempts to retrieve a reference that is closest to the sample strain.
+VAPOR works on a fasta file of full-length reference sequences for a given genome segment and a set of sequenced reads, and attempts to retrieve the reference that is closest to the sequenced strain.
 
 `sub_sample` is not an option here (compared to the tool on GitHub), since you can always build a workflow that preprocesses your reads to a (random) subsample. You can use this output as your reads file for VAPOR.
     ]]>    </help>
     <citations>
         <citation type="doi">10.1093/bioinformatics/btz814</citation>
     </citations>
-</tool>
\ No newline at end of file
+</tool>