view mash.xml @ 0:572587cbb1dd draft default tip

"planemo upload for repository https://github.com/brinkmanlab/galaxy-tools/tree/master/<name of containing folder> commit 33b02e08cbc8f76fb4b8537f8c968393f85a1b5e"
author brinkmanlab
date Fri, 24 Jan 2020 17:41:36 -0500
parents
children
line wrap: on
line source

<tool id="mash" name="MASH" version="2.1" profile="18.09">
    <!-- Short human-readable summary of what the wrapped program does. -->
    <description>Fast genome and metagenome distance estimation using MinHash</description>
    <!-- EDAM ontology annotations used to classify the tool.
         NOTE(review): topic_0091 is presumably "Bioinformatics"; confirm against the EDAM ontology. -->
    <edam_topics>
        <edam_topic>topic_0091</edam_topic>
    </edam_topics>
    <!-- NOTE(review): operation_0289 / operation_0296 presumably describe sequence distance
         matrix generation and sequence profile generation; confirm against the EDAM ontology. -->
    <edam_operations>
        <edam_operation>operation_0289</edam_operation>
        <edam_operation>operation_0296</edam_operation>
    </edam_operations>
    <macros>
        <!-- Sketching parameters shared by every mode that may have to sketch raw
             sequence input (expanded into the dist, sketch and triangle cases).
             The command template reads these values through its "sketching" helper. -->
        <xml name="sketching_options">
            <!-- sketching -->
            <param name="kmer_size" type="integer" min="1" max="32" value="21" argument="-k" label="K-mer size" />
            <param name="sketch_size" type="integer" min="1" value="1000" argument="-s" label="Sketch size" help="Each sketch will have at most this many non-redundant min-hashes." />
            <param name="hash_seed" type="integer" min="0" max="4294967296" value="42" argument="-S" label="Seed to provide to the hash function" />
            <param name="kmer_size_thresh" type="float" min="0" max="1" value="0.01" argument="-w" label="Probability threshold for warning about low k-mer size" />
            <!-- The boolean test parameter deliberately keeps the default true/false values so
                 that the when cases below can match; the command template emits -i itself. -->
            <conditional name="input_option">
                <param name="individual_seq" type="boolean" checked="false" argument="-i" label="Sketch individual sequences" help="Sketch individual sequences, rather than whole files, e.g. for multi-fastas of single-chromosome genomes or pair-wise gene comparisons." />
                <!-- Explicit empty case: no read-set options are offered when sketching individual sequences. -->
                <when value="true" />
                <when value="false">
                    <param name="input_read_set" type="boolean" checked="false" truevalue="-r" falsevalue="" argument="-r" label="Input is a read set" help="Treat the input as a set of sequencing reads. See the read options below (-b, -m, -c, -g). Incompatible with sketching individual sequences (-i)." />
                    <!-- sketching reads -->
                    <param name="bloom_size" type="text" argument="-b" optional="true" label="Use a Bloom filter of this size (raw bytes or with K/M/G/T)" help="Use a Bloom filter of this size (raw bytes or with K/M/G/T) to filter out unique k-mers. This is useful if exact filtering uses too much memory. However, some unique k-mers may pass erroneously, and copies cannot be counted beyond 2." >
                        <validator type="regex"><![CDATA[^[0-9]+[KMGT]?$]]></validator>
                    </param>
                    <param name="kmer_count" type="integer" min="0" value="1" argument="-m" label="Minimum copies of each k-mer required to pass noise filter for reads." />
                    <param name="target_coverage" type="float" min="0" value="0" argument="-c" label="Target coverage" help="Sketching will conclude if this coverage is reached before the end of the input file (estimated by average k-mer multiplicity)." />
                    <param name="genome_size" type="text" argument="-g" optional="true" label="Genome size (raw bases or with K/M/G/T)" help="If specified, will be used for p-value calculation instead of an estimated size from k-mer content." >
                        <validator type="regex"><![CDATA[^[0-9]+[KMGT]?$]]></validator>
                    </param>
                </when>
            </conditional>
            <!-- alphabet -->
            <param name="preserve_strand" type="boolean" checked="false" truevalue="-n" falsevalue="" argument="-n" label="Preserve strand" help="By default, strand is ignored by using canonical DNA k-mers, which are alphabetical minima of forward-reverse pairs." />
            <param name="use_aaa" type="boolean" checked="false" truevalue="-a" falsevalue="" argument="-a" label="Use amino acid alphabet (A-Z, except BJOUXZ)" help="Implied kmer size of 9 and preserve strand" />
            <!-- Free-text alphabet; the validator restricts it to ASCII letters (an empty value is allowed). -->
            <param name="alphabet" type="text" area="false" argument="-z" label="Alphabet to base hashes on" >
                <validator type="regex"><![CDATA[^[A-Za-z]*$]]></validator>
            </param>
            <param name="preserve_case" type="boolean" checked="false" truevalue="-Z" falsevalue="" argument="-Z" label="Preserve case in k-mers and alphabet" help="Sequence letters whose case is not in the current alphabet will be skipped when sketching." />
        </xml>
    </macros>
    <!-- Runtime dependency: the mash package, pinned to 2.1 (the same string as the tool's version attribute). -->
    <requirements>
        <requirement type="package" version="2.1">mash</requirement>
    </requirements>
    <!-- Command used to report the version of the installed mash binary. -->
    <version_command>mash --version</version_command>
    <command detect_errors="aggressive"><![CDATA[
        #import os
        ## ===== Overview =====
        ## Builds a single mash invocation for the mode chosen in the "commands"
        ## conditional. Steps: (1) symlink sketch datasets to names ending in .msh,
        ## (2) emit "mash <mode>", (3) append the mode-specific options and inputs,
        ## (4) route the result into the output dataset.

        ## =====  Declare reusable parameter output for different modes ===
        ## Emits the options declared by the sketching_options macro.
        #def sketching
            -k $commands.kmer_size -s $commands.sketch_size
            #if $commands.input_option.individual_seq
                -i ##Hack around current conditional truevalue/falsevalue behavior
            #end if
            -S $commands.hash_seed -w $commands.kmer_size_thresh
            #unless $commands.input_option.individual_seq
                ## mash documents -b, -m, -c and -g as implying -r, so passing them
                ## unconditionally would force read-set mode. Emit them only when
                ## the user declared the input to be a read set.
                #if str($commands.input_option.input_read_set)
                    $commands.input_option.input_read_set
                    #if $commands.input_option.bloom_size
                        -b $commands.input_option.bloom_size
                    #end if
                    -m $commands.input_option.kmer_count
                    -c $commands.input_option.target_coverage
                    #if $commands.input_option.genome_size
                        -g $commands.input_option.genome_size
                    #end if
                #end if
            #end unless
            $commands.preserve_strand $commands.use_aaa
            #if $commands.alphabet
                -z "$commands.alphabet"
            #end if
            $commands.preserve_case
        #end def

        ## ===== Change to extension MASH expects =====
        ## Sketch datasets are linked into the working directory under
        ## "<dataset basename>.msh"; the same names are used further down.
        #if $commands.command in ["dist", "screen"] and $commands.reference.is_of_type("msh")
            ln -sf '$commands.reference' '${os.path.basename(str($commands.reference))}.msh' &&
        #end if
        #if $commands.command in ["dist", "sketch", "triangle"]
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    ln -sf '$input' '${os.path.basename(str($input))}.msh' &&
                #end if
            #end for
        #end if
        #if $commands.command == "info"
            ln -sf '$commands.sketch' '${os.path.basename(str($commands.sketch))}.msh' &&
        #end if
        #if $commands.command == "paste"
            #for $sketch in $commands.sketches
                ln -sf '$sketch' '${os.path.basename(str($sketch))}.msh' &&
            #end for
        #end if
        #if $commands.command in ["sketch"]
            ## mash sketch writes "<prefix>.msh"; the link makes that write land in the output dataset.
            ln -sf '$output' output.msh &&
        #end if

        ## ===== Execute MASH =====
        mash $commands.command

        ## ===== Enable threading =====
        ## Intentionally disabled by the leading "False".
        #if False and $commands.command in ["dist", "screen", "sketch", "triangle"]
            -p \${GALAXY_SLOTS:-1} ## -p seems broken at the time of writing
        #end if

        ## ===== Select correct output parameters for selected mode =====
        #if $commands.command == "bounds"
            ## In bounds mode -p is the error-bound probability, not a thread count.
            -k $commands.kmer_size -p $commands.error_bound
            >> '$output'
        #else if $commands.command == "dist"
            $commands.table_output -v $commands.max_p -d $commands.max_dist
            ## Sketching options are only passed when neither the reference nor any
            ## query is already a sketch; a sketch carries its own parameters.
            #set $sketch = not $commands.reference.is_of_type("msh")
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    #set $sketch=False
                    #break
                #end if
            #end for
            #if $sketch
                $sketching()
            #end if
            #if $commands.reference.is_of_type("msh")
                '${os.path.basename(str($commands.reference))}.msh'
            #else
                '$commands.reference'
            #end if
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    '${os.path.basename(str($input))}.msh'
                #else
                    '$input'
                #end if
            #end for
            >> '$output'
        #else if $commands.command == "info"
            $commands.info_mode '${os.path.basename(str($commands.sketch))}.msh' >> '$output'
        #else if $commands.command == "paste"
            ## mash paste writes "<out_prefix>.msh" instead of printing to stdout, so
            ## the merged sketch is moved onto the output dataset afterwards.
            ## NOTE(review): assumes the prefix does not itself end in ".msh".
            "$commands.out_prefix"
            #for $sketch in $commands.sketches
                '${os.path.basename(str($sketch))}.msh'
            #end for
            && mv "${commands.out_prefix}.msh" '$output'
        #else if $commands.command == "screen"
            $commands.winner_takes_all -i $commands.min_ident -v $commands.max_p
            #if $commands.reference.is_of_type("msh")
                '${os.path.basename(str($commands.reference))}.msh'
            #else
                '$commands.reference'
            #end if
            #for $pool in $commands.pools
                '$pool'
            #end for
            >> '$output'
        #else if $commands.command == "sketch"
            ## Result goes to output.msh, which is linked to the output dataset above.
            -o output
            $sketching()
            #for $input in $commands.inputs
                '$input'
            #end for
        #else if $commands.command == "triangle"
            $commands.comment_fields
            $sketching()
            #for $input in $commands.inputs
                #if $input.is_of_type("msh")
                    '${os.path.basename(str($input))}.msh'
                #else
                    '$input'
                #end if
            #end for
            >> '$output'
        #end if
    ]]></command>
    <inputs>
        <!-- One case per mash sub-command; the command template reads every value
             through the "commands" prefix (for example commands.reference). -->
        <conditional name="commands">
            <param name="command" type="select" label="Mode">
                <option value="bounds">Bounds: Print a table of Mash error bounds</option>
                <option value="dist">Dist: Estimate the distance of query sequences to references</option>
                <option value="info">Info: Display information about sketch files</option>
                <option value="paste">Paste: Create a single sketch file from multiple sketch files</option>
                <option value="screen">Screen: Determine whether query sequences are within a larger pool of sequences</option>
                <option value="sketch">Sketch: Create sketches (reduced representations for fast operations)</option>
                <option value="triangle">Triangle: Estimate a lower-triangular distance matrix</option>
            </param>
            <when value="bounds">
                <param name="kmer_size" type="integer" min="1" max="32" value="21" argument="-k" label="K-mer size" />
                <param name="error_bound" type="float" min="0" max="1" value="0.99" argument="-p" label="Mash distance estimates will be within the given error bounds with this probability" />
            </when>
            <when value="dist">
                <!-- input -->
                <param name="reference" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" label="Reference" />
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" multiple="true" label="Queries" />
                <!-- output -->
                <param name="table_output" type="boolean" checked="false" truevalue="-t" falsevalue="" argument="-t" label="Table output" help="Table output (will not report p-values, but fields will be blank if they do not meet the p-value threshold)." />
                <param name="max_p" type="float" min="0" max="1" value="1" argument="-v" label="Maximum p-value to report" />
                <param name="max_dist" type="float" min="0" max="1" value="1" argument="-d" label="Maximum distance to report" />
                <expand macro="sketching_options" />
            </when>
            <when value="info">
                <!-- The command template reads commands.sketch in info mode; without this
                     parameter the template cannot be rendered. -->
                <param name="sketch" type="data" format="msh" label="Sketch" />
                <param name="info_mode" type="select" label="Mode">
                    <option value="" selected="true">Default</option>
                    <option value="-H">Only show header info, do not list each sketch</option>
                    <option value="-t">Tabular output (rather than padded), with no header</option>
                    <option value="-c">Show hash count histograms for each sketch</option>
                    <option value="-d">Dump sketches in JSON format</option>
                </param>
            </when>
            <when value="paste">
                <param name="sketches" type="data" format="msh" multiple="true" label="Sketches to merge" />
                <param name="out_prefix" type="text" area="false" label="Out prefix" />
            </when>
            <when value="screen">
                <param name="reference" type="data" format="msh" label="inputs" help="Use the sketch command to combine multiple sketch inputs." />
                <param name="pools" type="data" format="fasta,fastq,fasta.gz,fastq.gz" multiple="true" label="Pools" help="The pool sequences are assumed to be nucleotides, and will be 6-frame translated if the inputs are amino acids." />
                <param name="winner_takes_all" type="boolean" checked="false" truevalue="-w" falsevalue="" argument="-w" label="Winner-takes-all strategy for identity estimates" help="After counting hashes for each query, hashes that appear in multiple inputs will be removed from all except the one with the best identity (ties broken by larger query), and other identities will be reduced. This removes output redundancy, providing a rough compositional outline." />
                <param name="min_ident" type="float" min="-1" max="1" value="0" argument="-i" label="Minimum identity to report" help="Inclusive unless set to zero, in which case only identities greater than zero (i.e. with at least one shared hash) will be reported. Set to -1 to output everything." />
                <param name="max_p" type="float" min="0" max="1" value="1" argument="-v" label="Maximum p-value to report" />
            </when>
            <when value="sketch">
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz" multiple="true" label="Inputs" />
                <!-- TODO -I and -C -->
                <expand macro="sketching_options" />
            </when>
            <when value="triangle">
                <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz,msh" multiple="true" label="Input sequences" />
                <param name="comment_fields" type="boolean" checked="false" truevalue="-C" falsevalue="" label="Use comment fields for sequence names instead of IDs" />
                <expand macro="sketching_options" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single result dataset. It starts out as txt and is re-typed according to
             the selected mode; the cases are grouped by resulting format. -->
        <data format="txt" name="output">
            <change_format>
                <!-- plain text reports -->
                <when format="txt" input="commands.command" value="info" />
                <when format="txt" input="commands.command" value="screen" />
                <!-- tables -->
                <when format="tabular" input="commands.command" value="bounds" />
                <when format="tsv" input="commands.command" value="dist" />
                <!-- sketch files -->
                <when format="msh" input="commands.command" value="paste" />
                <when format="msh" input="commands.command" value="sketch" />
                <!-- distance matrix -->
                <when format="phylip" input="commands.command" value="triangle" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- bounds: default parameters, only checks that both table headings are printed. -->
        <test expect_num_outputs="1">
            <conditional name="commands">
                <param name="command" value="bounds" />
            </conditional>
            <output name="output" ftype="tabular">
                <assert_contents>
                    <has_text text="Mash distance" />
                    <has_text text="Screen distance" />
                </assert_contents>
            </output>
        </test>
        <!-- dist: one reference against three queries, sketching individual sequences.
             The query parameter of the dist case is named "inputs" and is a multiple
             data parameter, so the files are passed as a comma-separated list. -->
        <test expect_num_outputs="1">
            <conditional name="commands">
                <param name="command" value="dist" />
                <param name="reference" value="test-data/15600_genome.fasta" ftype="fasta" />
                <param name="inputs" value="test-data/15596_genome.fasta,test-data/15602_genome.fasta,test-data/15599_genome.fasta" ftype="fasta" />
                <conditional name="input_option">
                    <param name="individual_seq" value="true" />
                </conditional>
            </conditional>
            <output name="output" checksum="sha256:701ecc0d4fec3383267699ebd9d78dd7b84360ff57754aac4f2bc8bc00aee580" ftype="tsv" />
        </test>
    </tests>
    <!-- Help text: currently only a link to the upstream Mash documentation. -->
    <help><![CDATA[
        https://mash.readthedocs.io/en/latest/
    ]]></help>
    <!-- References to cite when using this tool.
         NOTE(review): presumably the Zenodo DOI is the software record and the second DOI
         is the Mash publication; confirm before relying on it. -->
    <citations>
        <citation type="doi">10.5281/zenodo.3364789</citation>
        <citation type="doi">10.1186/s13059-016-0997-x</citation>
    </citations>
</tool>