Mercurial > repos > iuc > vapor

<tool id="vapor" name="VAPOR" version="@TOOL_VERSION@+galaxy3" profile="21.05">
    <!-- One-line summary of the tool. -->
    <description>
        Classify Influenza samples from short reads sequence data
    </description>
    <macros>
        <!-- Version of the wrapped vapor package. The token is expanded both in the
        tool's version attribute and in the vapor package requirement below, so the
        two always stay in sync. -->
        <token name="@TOOL_VERSION@">1.0.2</token>
    </macros>
    <xrefs>
        <!-- Cross-reference to the tool's bio.tools entry. -->
        <xref type="bio.tools">vapor</xref>
    </xrefs>
    <requirements>
        <!-- The wrapped tool itself, pinned to the version token defined above. -->
        <requirement type="package" version="@TOOL_VERSION@">vapor</requirement>
        <!-- gawk is only required for circumventing a current bug in the command line
        tool (see the awk call in the command section below).
        Remove this requirement once the bug is fixed and the workaround is dropped. -->
        <requirement type="package" version="5.1.0">gawk</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Number of reference sequences as recorded in the dataset metadata.
        ## Only used when the user asks for all matches (return_best_n set to 0).
        ## The metadata value is optional and may be unset, so guard the cast and
        ## treat "unset" as 0; in that case the sequences are counted at runtime
        ## instead (see the return_best_n handling below).
        #set $total_refs = int($fasta_file.metadata.sequences or 0)

        ## The on the fly uppercasing below is a workaround for a bug
        ## in vapor 1.0.2, which caused sequence comparisons to be case-sensitive.
        ## Got fixed upstream in:
        ## https://github.com/connor-lab/vapor/commit/b5ec5857cbf53ed45ca7487dac2b4b85ecfe33ea
        ## but unfortunately no release has been tagged since.
        ## Remove with next release!!
        ## Header lines (starting with ">") are passed through unchanged, all other
        ## lines are uppercased. The result is written to a separate file whose name
        ## is kept in its own variable so the input dataset variable is not shadowed.
        #set $ref_upper = 'ref_upper.fa'
        awk '/^>/ { print; next } { print toupper($0) }' '$fasta_file' > '$ref_upper' &&

        ## Link the reads into the working directory under names that carry the
        ## Galaxy datatype extension of each dataset.
        #if str($fastq_input.fastq_input_selector) == "paired"
            #set r1_ext = $fastq_input.fastq1.extension
            #set r2_ext = $fastq_input.fastq2.extension
            ln -s '$fastq_input.fastq1' fastq1.$r1_ext &&
            ln -s '$fastq_input.fastq2' fastq2.$r2_ext &&
        #elif str($fastq_input.fastq_input_selector) == "paired_collection"
            #set r1_ext = $fastq_input.fastq_pairs.forward.extension
            #set r2_ext = $fastq_input.fastq_pairs.reverse.extension
            ln -s '$fastq_input.fastq_pairs.forward' fastq1.$r1_ext &&
            ln -s '$fastq_input.fastq_pairs.reverse' fastq2.$r2_ext &&
        #else
            #set r1_ext = $fastq_input.fastq_single.extension
            ln -s '$fastq_input.fastq_single' fastq1.$r1_ext &&
        #end if
        vapor.py
            ## return_best_n == 0 means "report all candidates": pass the total number
            ## of reference sequences, taken from metadata when available and counted
            ## from the uppercased reference file at runtime otherwise.
            #if int($return_best_n) > 0
                --return_best_n $return_best_n
            #elif $total_refs > 0
                --return_best_n $total_refs
            #else
                --return_best_n "\$(awk '/^>/ { n++ } END { print n + 0 }' '$ref_upper')"
            #end if
            #if str($output_type) == "fasta"
                --return_seqs
            #end if
            -k $opt.kmer_length
            -t $opt.threshold
            -c $opt.min_kmer_cov
            -m $opt.min_kmer_prop
            -fa '$ref_upper'
            -fq fastq1.$r1_ext
            #if str($fastq_input.fastq_input_selector) in ["paired", "paired_collection"]
                fastq2.$r2_ext
            #end if
            -f $opt.top_seed_frac
        > out_file
    ]]></command>
    <inputs>
        <!-- Reference sequences the reads get classified against. -->
        <param name="fasta_file"
               type="data"
               format="fasta"
               label="Reference sequences"
               help="Select a multisample fasta dataset with reference sequences to base classification on."/>

        <!-- Sequenced reads: a single dataset, two separate datasets, or a paired collection. -->
        <conditional name="fastq_input">
            <param name="fastq_input_selector"
                   type="select"
                   label="Type of sequencing data">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end as collection</option>
            </param>
            <when value="single">
                <param name="fastq_single"
                       type="data"
                       format="fastqsanger,fastqsanger.gz"
                       label="Sequenced reads"
                       help="Specify the sequenced reads dataset."/>
            </when>
            <when value="paired">
                <param name="fastq1"
                       type="data"
                       format="fastqsanger,fastqsanger.gz"
                       label="Forward reads"
                       help="Specify the sequenced reads dataset with forward reads."/>
                <param name="fastq2"
                       type="data"
                       format="fastqsanger,fastqsanger.gz"
                       label="Reverse reads"
                       help="Specify the sequenced reads dataset with reverse reads."/>
            </when>
            <when value="paired_collection">
                <param name="fastq_pairs"
                       type="data_collection"
                       collection_type="paired"
                       format="fastqsanger,fastqsanger.gz"
                       label="Paired collection of sequenced reads"
                       help="Select a collection with forward and reverse reads."/>
            </when>
        </conditional>

        <!-- Selects which of the two filtered outputs gets produced. -->
        <param name="output_type"
               type="select"
               label="Desired output">
            <option value="scores" selected="true">Return scores of best matches</option>
            <option value="fasta">Return FASTA sequences of best matches</option>
        </param>

        <!-- Zero is translated into "all reference sequences" in the command section. -->
        <param name="return_best_n"
               type="integer"
               min="0"
               value="1"
               label="Limit number of reported matches to"
               help="Determines the maximum number of candidate matches sorted by score that will be reported. Set to zero to get all candidate matches reported."/>

        <!-- Tuning parameters; each one is always passed to the command line. -->
        <section name="opt" title="Optional arguments" expanded="true">
            <param name="kmer_length"
                   argument="-k"
                   type="integer"
                   min="5"
                   max="30"
                   value="21"
                   label="Kmer Length"
                   help="Generate k-mers of this length from the reference sequences and the sequenced reads. Note: smaller k-mer sizes come at the cost of decreased specificity and k-mer sizes below 21 have an increased risk of contaminating sequences getting analyzed. Only decrease the default (21) if you know your sample is pure (i.e., sequenced reads represent viral reads only), or if you have increased --threshold sufficiently."/>
            <param argument="--threshold"
                   type="float"
                   min="0"
                   max="1"
                   value="0.2"
                   label="Read kmer filtering threshold"
                   help="Sequenced reads that don't have at least this proportion of their k-mers matching k-mers generated from the reference sequences will not be considered in the analysis; default: 0.2"/>
            <param argument="--min_kmer_cov"
                   type="integer"
                   min="1"
                   value="5"
                   label="Coverage threshold for k-mer culling"
                   help="Minimum coverage by sequenced reads for a reference k-mer to be kept during culling; default: 5"/>
            <param argument="--min_kmer_prop"
                   type="float"
                   min="0"
                   max="1"
                   value="0.1"
                   label="Minimum k-mer proportion"
                   help="Minimum proportion of matching kmers required for queries; default: 0.1"/>
            <param argument="--top_seed_frac"
                   type="float"
                   min="0"
                   max="1"
                   value="1"
                   label="Fraction of best seeds to extend"
                   help="Of the queries still considered after applying the --min_kmer_prop filter above, the tool will calculate and report scores only for this fraction. Lowering this value leads to shorter runtime because fewer scores will have to be calculated, but also to fewer results getting reported. CAVEAT: this version of the tool will round down the result of applying this parameter so it is possible to end up with zero queries to be considered further. Change only if runtime is an issue!"/>
        </section>
    </inputs>
    <outputs>
        <!-- Both outputs are collected from the same work-dir file (out_file);
        the filters on output_type make sure only one of them is created per run. -->

        <!-- Tabular score report, produced when output_type is "scores". -->
        <data name="output_scores"
              format="tabular"
              from_work_dir="out_file"
              label="${tool.name} on ${on_string}: closest reference scores">
            <filter>output_type == "scores"</filter>
            <actions>
                <!-- Attach column headers to the tabular dataset as metadata. -->
                <action type="metadata"
                        name="column_names"
                        default="% of query bases in reads,Total score,Query length,Mean score,Reads after culling,Query description"/>
            </actions>
        </data>

        <!-- FASTA with the best-matching references, produced when output_type is "fasta". -->
        <data name="output_fasta"
              format="fasta"
              from_work_dir="out_file"
              label="${tool.name} on ${on_string}: closest reference fasta">
            <filter>output_type == "fasta"</filter>
        </data>
    </outputs>
    <tests>
        <!-- All read datasets are uploaded with an explicit ftype that is accepted by
        the corresponding input (fastqsanger or fastqsanger.gz). -->

        <!-- Test 1: single-end reads, all defaults (best match only, scores output). -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single" />
                <param name="fastq_single" ftype="fastqsanger" value="test_reads.fq" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <output name="output_scores" file="output1.tab" />
        </test>
        <!-- Test 2: return_best_n set to 0, i.e. report all candidate matches. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single" />
                <param name="fastq_single" ftype="fastqsanger" value="test_reads.fq" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <param name="return_best_n" value="0" />
            <output name="output_scores" file="output1_full.tab" />
        </test>
        <!-- Test 3: paired-end reads given as two separate datasets. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired" />
                <param name="fastq1" ftype="fastqsanger" value="test_reads.fq" />
                <param name="fastq2" ftype="fastqsanger" value="test_reads2.fq" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <output name="output_scores" file="output2.tab" />
        </test>
        <!-- Test 4: paired-end reads given as a paired collection. Uses the same
        reads as test 3, so the same result is expected. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired_collection" />
                <param name="fastq_pairs">
                    <collection type="paired">
                        <element name="forward" ftype="fastqsanger" value="test_reads.fq" />
                        <element name="reverse" ftype="fastqsanger" value="test_reads2.fq" />
                    </collection>
                </param>
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <output name="output_scores" file="output2.tab" />
        </test>
        <!-- Test 5: gzip-compressed single-end reads; expected output is that of test 1. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single" />
                <param name="fastq_single" ftype="fastqsanger.gz" value="test_reads.fastqsanger.gz" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <output name="output_scores" file="output1.tab" />
        </test>
        <!-- Test 6: non-default values for all parameters of the "opt" section. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single" />
                <param name="fastq_single" ftype="fastqsanger" value="test_reads.fq" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <section name="opt">
                <param name="kmer_length" value="29" />
                <param name="threshold" value="0.5" />
                <param name="min_kmer_cov" value="7" />
                <param name="min_kmer_prop" value="0.5" />
                <param name="top_seed_frac" value="0.5" />
            </section>
            <output name="output_scores" file="output4.tab" />
        </test>
        <!-- Test 7: FASTA output of the three best matches. -->
        <test expect_num_outputs="1">
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="single" />
                <param name="fastq_single" ftype="fastqsanger" value="test_reads.fq" />
            </conditional>
            <param name="fasta_file" value="HA_sample.fa" />
            <param name="output_type" value="fasta" />
            <param name="return_best_n" value="3" />
            <output name="output_fasta" file="output5.fa" />
        </test>
    </tests>
    <help><![CDATA[
**What it does**

VAPOR is a tool for classification of Influenza samples from raw short read sequence data for downstream bioinformatics analysis.
VAPOR works on a fasta file of full-length reference sequences for a given genome segment and a set of sequenced reads, and attempts to retrieve the reference that is closest to the sequenced strain.

Unlike the command-line tool on GitHub, this wrapper does not offer a ``sub_sample`` option, since you can always build a workflow that preprocesses your reads to a (random) subsample and then use that subsample as the reads input for VAPOR.
    ]]>    </help>
    <citations>
        <!-- Reference to cite when using this tool, given as a DOI. -->
        <citation type="doi">10.1093/bioinformatics/btz814</citation>
    </citations>
</tool>
author	iuc
date	Wed, 16 Nov 2022 13:41:01 +0000
parents	f11d2dd29b2b
children