<!-- Mercurial > repos > iuc > mmseqs2_taxonomy_assignment -->

<tool id="mmseqs2_taxonomy_assignment" name="MMseqs2 Taxonomy Assignments" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>
        of sequences by comparing them to a reference database
    </description>
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
## Workflow: createdb -> [filtertaxseqdb] -> taxonomy -> createtsv -> [taxonomyreport (Kraken)] -> [taxonomyreport (Krona)]
## Steps in brackets only run when the corresponding input asks for them.

## MMseqs2 picks its gzip reader from the file name, so the symlink has to keep the ".gz" suffix
## for compressed inputs (fasta.gz / fastq.gz).
#if $createdb.input_fasta.ext.endswith('.gz')
    #set $query_file = 'input.gz'
#else
    #set $query_file = 'input'
#end if

## Resolve the reference database prefix once instead of repeating the test/production switch
## in front of every MMseqs2 call.
#if str($download_tax_db) == 'true'
    ##Used only for test: the database is copied into the working directory further down
    #set $reference_db = 'database'
#else
    #set $reference_db = str($database.database_type.mmseqs2_db_select.fields.path) + '/database'
#end if

## Database that is actually searched and reported on: the taxon-filtered copy when a taxon list was given.
#if $filtertaxseqdb.taxon_list
    #set $target_db = 'database_filtered'
#else
    #set $target_db = $reference_db
#end if

ln -s -f '${createdb.input_fasta}' '$query_file' &&
mmseqs createdb
    '$query_file'
    'sequenceDB'
    --dbtype '$createdb.alph_type.dbtype'
    --shuffle $createdb.shuffle &&

##Used only for test
#if str($download_tax_db) == 'true'
cp -r '$database.database_type.mmseqs2_db_select.fields.path'/database* . &&

mmseqs createtaxdb
    database
    'tmp' &&
#end if
##

#if $filtertaxseqdb.taxon_list
    mmseqs filtertaxseqdb
        '$reference_db'
        'database_filtered'
        --taxon-list '$filtertaxseqdb.taxon_list'
            &&
#end if

mmseqs taxonomy
    'sequenceDB'
    '$target_db'
    'output_taxonomy'
    'tmp'
    #if str($createdb.alph_type.dbtype) == "1"
        --comp-bias-corr-scale $createdb.alph_type.comp_bias_corr_scale
    #elif str($createdb.alph_type.dbtype) == "2"
        --zdrop $createdb.alph_type.zdrop
    #end if
    ##Pre-filter options
    --add-self-matches $taxonomy.prefilter.add_self_matches
    -s $taxonomy.prefilter.sensitivity
    -k $taxonomy.prefilter.kmer_length
    --target-search-mode $taxonomy.prefilter.target_search_mode
    ##--k-score TWIN                   k-mer threshold for generating similar k-mer lists [seq:2147483647,prof:2147483647]
    --max-seqs $taxonomy.prefilter.max_seqs
    --split $taxonomy.prefilter.split
    --split-mode $taxonomy.prefilter.split_mode
    ##--split-memory-limit BYTE        Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
    --diag-score $taxonomy.prefilter.diag_score
    --exact-kmer-matching $taxonomy.prefilter.exact_kmer_matching
    --mask $taxonomy.prefilter.mask
    --mask-prob $taxonomy.prefilter.mask_prob
    --mask-lower-case $taxonomy.prefilter.mask_lower_case
    --mask-n-repeat $taxonomy.prefilter.mask_n_repeat
    --min-ungapped-score $taxonomy.prefilter.min_ungapped_score
    --spaced-kmer-mode $taxonomy.prefilter.spaced_kmer_mode
    ##--spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
    ##--local-tmp STR                  Path where some of the temporary files will be created []
    ##--disk-space-limit BYTE          Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]

    ##Align options
    -a $taxonomy.align.convertalis
    ##The next 2 parameters seems to be the same
    --alignment-mode $taxonomy.align.alignment_mode
    --alignment-output-mode $taxonomy.align.alignment_output_mode
    --wrapped-scoring $taxonomy.align.wrapped_scoring
    -e $taxonomy.align.evalue
    --min-seq-id $taxonomy.align.min_seq_id
    --min-aln-len $taxonomy.align.min_aln_len
    --seq-id-mode $taxonomy.align.seq_id_mode
    --alt-ali $taxonomy.align.alt_ali
    -c $taxonomy.align.cov
    --cov-mode $taxonomy.align.cov_mode
    --max-rejected $taxonomy.align.max_rejected
    --max-accept $taxonomy.align.max_accept
    --score-bias $taxonomy.align.score_bias
    --realign $taxonomy.align.realign
    --realign-score-bias $taxonomy.align.realign_score_bias
    --realign-max-seqs $taxonomy.align.realign_max_seqs
    --corr-score-weight $taxonomy.align.corr_score_weight
    --exhaustive-search-filter $taxonomy.align.exhaustive_search_filter

    ##Profile options
    ##--pca                            Pseudo count admixture strength []
    ##--pcb                            Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []
    --mask-profile $taxonomy.profile.mask_profile
    --e-profile $taxonomy.profile.e_profile
    --wg $taxonomy.profile.wg
    --filter-msa $taxonomy.profile.filter_msa
    --filter-min-enable $taxonomy.profile.filter_min_enable
    --max-seq-id $taxonomy.profile.max_seq_id
    ##Free-text parameter (single value or comma separated list), hence quoted
    --qid '$taxonomy.profile.qid'
    --qsc $taxonomy.profile.qsc
    --cov $taxonomy.profile.cov
    --diff $taxonomy.profile.diff
    --pseudo-cnt-mode $taxonomy.profile.pseudo_cnt_mode
    --exhaustive-search $taxonomy.profile.exhaustive_search
    --lca-search $taxonomy.profile.lca_search

    ##Misc options
    ##--orf-filter INT                 Prefilter query ORFs with non-selective search
    ##                             Only used during nucleotide-vs-protein classification
    ##                             NOTE: Consider disabling when classifying short reads [1]
    --orf-filter-e $taxonomy.misc.orf_filter_e
    --orf-filter-s $taxonomy.misc.orf_filter_s
    --lca-mode $taxonomy.misc.lca_mode
    --tax-output-mode $taxonomy.misc.tax_output_mode
    --majority $taxonomy.misc.majority
    --vote-mode $taxonomy.misc.vote_mode
    ##--lca-ranks STR                  Add column with specified ranks (',' separated) []
    --tax-lineage $taxonomy.misc.tax_lineage
    ##Both lists are empty by default. Emitting the bare flag would make it swallow the next
    ##option as its value, so each flag is only written when the user supplied something;
    ##otherwise MMseqs2 falls back to its own default.
    #if $taxonomy.misc.blacklist
        --blacklist '$taxonomy.misc.blacklist'
    #end if
    #if $taxonomy.misc.taxon_list
        --taxon-list '$taxonomy.misc.taxon_list'
    #end if
    --rescore-mode $taxonomy.misc.rescore_mode
    --allow-deletion $taxonomy.misc.allow_deletion
    --min-length $taxonomy.misc.min_length
    --max-length $taxonomy.misc.max_length
    --max-gaps $taxonomy.misc.max_gaps
    --contig-start-mode $taxonomy.misc.contig_start_mode
    --contig-end-mode $taxonomy.misc.contig_end_mode
    --orf-start-mode $taxonomy.misc.orf_start_mode
    --forward-frames '$taxonomy.misc.forward_frames'
    --reverse-frames '$taxonomy.misc.reverse_frames'
    --translation-table $taxonomy.misc.translation_table
    --translate $taxonomy.misc.translate
    --use-all-table-starts $taxonomy.misc.use_all_table_starts
    --id-offset $taxonomy.misc.id_offset
    --sequence-overlap $taxonomy.misc.sequence_overlap
    --sequence-split-mode $taxonomy.misc.sequence_split_mode
    --headers-split-mode $taxonomy.misc.headers_split_mode
    --search-type $database.database_type.search_type
    --prefilter-mode $taxonomy.misc.prefilter_mode

    ##Common options
    ##--compressed INT                 Write compressed output [0]
    --threads "\${GALAXY_SLOTS:-1}"
    ##-v INT                           Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
    --max-seq-len $taxonomy.common.max_seq_len
    ##--db-load-mode INT               Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
    ##--mpi-runner STR                 Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
    ##--force-reuse BOOL               Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
    ##--remove-tmp-files BOOL          Delete temporary files [0]

    ##Expert options
    --filter-hits $taxonomy.expert.filter_hits
    --sort-results $taxonomy.expert.sort_results
    ##--create-lookup INT              Create database lookup file (can be very large) [0]
    --chain-alignments $taxonomy.expert.chain_alignments
    --merge-query $taxonomy.expert.merge_query
    ##--strand INT                     Strand selection only works for DNA/DNA search 0: reverse, 1: forward, 2: both [1]
&&
mmseqs createtsv
    'sequenceDB'
    'output_taxonomy'
    'taxo_result.tsv'

    --first-seq-as-repr $createtsv.first_seq_as_repr
    --target-column $createtsv.target_column
    --full-header $createtsv.full_header
    --idx-seq-src $createtsv.idx_seq_src
    --threads "\${GALAXY_SLOTS:-1}"
    ##--compressed INT          Write compressed output [0]
    ##-v INT                    Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
    ##--db-output BOOL          Return a result DB instead of a text file [0]

##Kraken style report (--report-mode 0)
#if str($kraken_report.keep_report) == "Yes"
&&
    mmseqs taxonomyreport
        '$target_db'
        'output_taxonomy'
        'taxo_result.txt'
        --report-mode 0
        --threads "\${GALAXY_SLOTS:-1}"
#end if
##Krona style report (--report-mode 1)
#if str($krona_report.keep_report) == "Yes"
&&
    mmseqs taxonomyreport
        '$target_db'
        'output_taxonomy'
        'taxo_result.html'
        --report-mode 1
        --threads "\${GALAXY_SLOTS:-1}"
#end if
    ##-v INT              Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]

    ]]></command>
    <inputs>
        <!-- Test-only switch, never shown to users. When a test sets it to "true" the command copies
             the reference database into the working directory and runs "mmseqs createtaxdb" on the copy,
             so the taxonomy part of the database (too large for the test data directory and not
             reducible) does not have to be stored alongside the tests. -->
        <param type="hidden" name="download_tax_db" value=""/>
        <!-- Options of "mmseqs createdb", which converts the query sequences into an MMseqs2 sequence DB. -->
        <section name="createdb" title="Convert FASTA/Q file(s) to MMseqs sequence DB format" expanded="true">
            <!-- FASTA and FASTQ are both accepted, plain or gzip-compressed -->
            <param name="input_fasta" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input FASTA/FASTQ file"/>
            <!-- The alphabet decides which alphabet-specific option is passed on to "mmseqs taxonomy" -->
            <conditional name="alph_type">
                <param argument="--dbtype" type="select" label="Input type">
                    <option value="0" selected="true">Auto</option>
                    <option value="1">Amino acid</option>
                    <option value="2">Nucleotides</option>
                </param>
                <when value="0"/>
                <when value="1">
                    <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction"/>
                </when>
                <when value="2">
                    <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated"/>
                </when>
            </conditional>
            <!-- A checkbox always resolves to truevalue or falsevalue, so it is not declared optional -->
            <param argument="--shuffle" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Shuffle input database"/>
        </section>
        <!-- Reference database selection. Only entries of the "mmseqs2_databases" data table whose
             "taxonomy" column is "yes" are offered, split by the "type" column; each branch also
             expands its own search-type macro. -->
        <section name="database" expanded="true" title="Choose the taxonomic reference database that you want to use">
            <conditional name="database_type">
                <param name="type" type="select" label="Database type">
                    <option value="amino_acid_tax" selected="true">Amino acid with taxonomy information</option>
                    <option value="nucleotides_tax">Nucleotides with taxonomy information</option>
                </param>
                <!-- Protein reference databases -->
                <when value="amino_acid_tax">
                    <param name="mmseqs2_db_select" type="select" label="MMseqs2 databases">
                        <options from_data_table="mmseqs2_databases">
                            <filter type="static_value" column="type" value="aminoacid"/>
                            <filter type="static_value" column="taxonomy" value="yes"/>
                            <validator type="no_options" message="No mmseqs2 database is available"/>
                        </options>
                    </param>
                    <expand macro="search_type_aa"/>
                </when>
                <!-- Nucleotide reference databases -->
                <when value="nucleotides_tax">
                    <param name="mmseqs2_db_select" type="select" label="MMseqs2 databases">
                        <options from_data_table="mmseqs2_databases">
                            <filter type="static_value" column="type" value="nucleotide"/>
                            <filter type="static_value" column="taxonomy" value="yes"/>
                            <validator type="no_options" message="No mmseqs2 database is available"/>
                        </options>
                    </param>
                    <expand macro="search_type_nt"/>
                </when>
            </conditional>
        </section>
        <!-- Optional pre-filtering of the reference database: "mmseqs filtertaxseqdb" only runs
             when a taxon list is entered here. -->
        <section name="filtertaxseqdb" title="Filter taxonomy sequence database">
            <param argument="--taxon-list" type="text" value="" optional="true" label="Taxonomy ID" help="Possibly multiple values separated by ','"/>
        </section>
        <section name="taxonomy" title="Taxonomy assignment by computing the lowest common ancestor of homologs">
            <!-- Pre-filter stage options of "mmseqs taxonomy"; shared ones come from the macro. -->
            <section name="prefilter" title="Pre-filter">
                <expand macro="prefilter_common_parameters"/>
                <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode">
                    <option value="0">Use consecutive positions in k-mers</option>
                    <option value="1" selected="true">Use spaced k-mers</option>
                </param>
                <param argument="--min-ungapped-score" type="integer" value="15" min="0" label="Accept only matches with ungapped alignment score above threshold"/>
                <param argument="-s" name="sensitivity" type="float" value="2" min="0" max="7.5" label="Sensitivity" help="1.0 faster; 4.0 fast; 7.5 sensitive"/>
                <param argument="--target-search-mode" type="select" label="Target search mode">
                    <option value="0" selected="true">Regular k-mer</option>
                    <option value="1">Similar k-mer</option>
                </param>
                <param argument="--max-seqs" type="integer" value="300" min="0" label="Maximum results per query sequence allowed to pass the prefilter" help="Affects sensitivity"/>
                <param argument="--split" type="integer" value="0" min="0" label="Split input into N equally distributed chunks" help="0: set the best split automatically"/>
                <param argument="--split-mode" type="select" label="Split mode">
                    <option value="0">Split target db</option>
                    <option value="1">Split query db</option>
                    <option value="2" selected="true">Auto, depending on main memory</option>
                </param>
                <param argument="--diag-score" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Use ungapped diagonal scoring during prefilter"/>
                <param argument="--exact-kmer-matching" type="integer" value="0" min="0" max="1" label="Extract only exact k-mers for matching"/>
            </section>
            <!-- Alignment stage options of "mmseqs taxonomy"; shared ones come from the macro. -->
            <section name="align" title="Align">
                <expand macro="align_common_parameters" />
                <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment">
                    <option value="0">Automatic</option>
                    <option value="1" selected="true">Only score and end_pos</option>
                    <option value="2">Also start_pos and cov</option>
                    <option value="3">Also seq.id</option>
                    <option value="4">Only ungapped alignment</option>
                </param>
                <param argument="-e" name="evalue" type="float" min="0" value="1" label="E-value threshold" help="List matches below this E-value"/>
                <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
                <!-- Coverage is a fraction, so it gets the same 0-1 bounds as the sequence identity above -->
                <param argument="-c" name="cov" type="float" min="0" max="1" value="0" label="List matches above this fraction of aligned (covered) residues"/>
                <param argument="--cov-mode" type="select" label="Coverage mode">
                    <option value="0" selected="true">Coverage of query and target</option>
                    <option value="1">Coverage of target</option>
                    <option value="2">Coverage of query</option>
                    <option value="3">Target seq. length has to be at least x% of query length</option>
                    <option value="4">Query seq. length has to be at least x% of target length</option>
                    <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
                </param>
                <param argument="--max-rejected" type="integer" min="0" value="5" label="Maximum rejected alignments before alignment calculation for a query is stopped"/>
                <param argument="--max-accept" type="integer" min="0" value="30" label="Maximum accepted alignments before alignment calculation for a query is stopped"/>
                <param argument="--exhaustive-search-filter" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Filter result during search ?"/>
            </section>
            <!-- Profile options of "mmseqs taxonomy". -->
            <section name="profile" title="Profile">
                <param argument="--mask-profile" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Mask query sequence of profile using tantan"/>
                <param argument="--e-profile" type="float" min="0" value="1e-03" label="Include sequence matches with an E-value below this threshold into the profile"/>
                <param argument="--wg" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Use global sequence weighting for profile calculation"/>
                <param argument="--filter-msa" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Filter MSA"/>
                <param argument="--filter-min-enable" type="integer" min="0" value="0" label="Only filter MSAs with more than N sequences, 0 always filters"/>
                <param argument="--max-seq-id" type="float" min="0" max="1" value="0.9" label="Reduce redundancy of output MSA using max. pairwise sequence identity"/>
                <!-- Text rather than float because a comma separated list of thresholds is also accepted.
                     The help is kept on one line: line breaks inside an attribute value turn into runs of spaces. -->
                <param argument="--qid" type="text" value="0" label="Reduce diversity of output MSAs using min.seq. identity with query sequences [0.0,1.0]" help="Alternatively, can be a list of multiple thresholds, e.g. 0.15,0.30,0.50 defines filter buckets of ]0.15-0.30] and ]0.30-0.50]"/>
                <param argument="--qsc" type="float" min="-50" max="100" value="-20" label="Reduce diversity of output MSAs using min. score per aligned residue with query sequences"/>
                <param argument="--cov" type="float" min="0" max="1" value="0" label="Filter output MSAs using min. fraction of query residues covered by matched sequences"/>
                <param argument="--diff" type="integer" min="0" value="1000" label="Filter MSAs by selecting most diverse set of sequences, keeping at least this many seqs in each MSA block of length 50"/>
                <param argument="--pseudo-cnt-mode" type="select" label="Pseudo count mode">
                    <option value="0" selected="true">Substitution-matrix</option>
                    <option value="1">Context-specific pseudocounts</option>
                </param>
                <param argument="--exhaustive-search" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Exhaustive search"/>
                <param argument="--lca-search" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Efficient search for LCA candidates"/>
            </section>
            <!-- Miscellaneous options of "mmseqs taxonomy": LCA computation, ORF extraction,
                 translation and sequence splitting. -->
            <section name="misc" title="Misc">
                <param argument="--orf-filter-e" type="float" min="0" value="1.000E+02" label="E-value threshold used for query ORF prefiltering"/>
                <param argument="--orf-filter-s" type="float" min="0" value="2" label="Sensitivity used for query ORF prefiltering"/>
                <param argument="--lca-mode" type="select" label="LCA mode">
                    <option value="1">Single search LCA</option>
                    <option value="3" selected="true">Approximate 2bLCA</option>
                    <option value="4">Top hit</option>
                </param>
                <param argument="--tax-output-mode" type="select" label="Taxonomy output mode">
                    <option value="0" selected="true">Output LCA</option>
                    <option value="1">Output alignment</option>
                    <option value="2">Output both</option>
                </param>
                <!-- A fraction of agreement cannot exceed 1, hence the upper bound -->
                <param argument="--majority" type="float" min="0" max="1" value="0.5" label="Minimal fraction of agreement among taxonomically assigned sequences of a set"/>
                <param argument="--vote-mode" type="select" label="Mode of assigning weights to compute majority">
                    <option value="0">Uniform</option>
                    <option value="1" selected="true">Minus log E-value</option>
                    <option value="2">Score</option>
                </param>
                <param argument="--tax-lineage" type="select" label="Taxonomy lineage">
                    <option value="0" selected="true">Don't show</option>
                    <option value="1">Add all lineage names</option>
                    <option value="2">Add all lineage taxids</option>
                </param>
                <param argument="--blacklist" type="text" value="" label="Comma separated list of ignored taxa in LCA computation"/>
                <param argument="--taxon-list" type="text" value="" label="Taxonomy ID, possibly multiple values separated by ','"/>
                <param argument="--rescore-mode" type="select" label="Rescore diagonals with">
                    <option value="0" selected="true">Hamming distance</option>
                    <option value="1">Local alignment (score only)</option>
                    <option value="2">Local alignment</option>
                    <option value="3">Global alignment</option>
                    <option value="4">Longest alignment fulfilling window quality criterion</option>
                </param>
                <param argument="--allow-deletion" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Allow deletions in a MSA"/>
                <!-- ORF extraction -->
                <param argument="--min-length" type="integer" min="0" value="30" label="Minimum codon number in open reading frames"/>
                <param argument="--max-length" type="integer" min="0" value="32734" label="Maximum codon number in open reading frames"/>
                <param argument="--max-gaps" type="integer" min="0" value="2147483647" label="Maximum number of codons with gaps or unknown residues before an open reading frame is rejected"/>
                <param argument="--contig-start-mode" type="select" label="Contig start can be">
                    <option value="0">Incomplete</option>
                    <option value="1">Complete</option>
                    <option value="2" selected="true">Both</option>
                </param>
                <param argument="--contig-end-mode" type="select" label="Contig end can be">
                    <option value="0">Incomplete</option>
                    <option value="1">Complete</option>
                    <option value="2" selected="true">Both</option>
                </param>
                <param argument="--orf-start-mode" type="select" label="ORF fragment can be">
                    <option value="0">From start to stop</option>
                    <option value="1" selected="true">From any to stop</option>
                    <option value="2">From last encountered start to stop (no start in the middle)</option>
                </param>
                <param argument="--forward-frames" type="text" value="1,2,3" label="Comma-separated list of frames on the forward strand to be extracted"/>
                <param argument="--reverse-frames" type="text" value="1,2,3" label="Comma-separated list of frames on the reverse strand to be extracted"/>
                <!-- Translation -->
                <param argument="--translation-table" type="select" label="Translation table">
                    <option value="1" selected="true">Canonical</option>
                    <option value="2">The Vertebrate Mitochondrial Code</option>
                    <option value="3">The Yeast Mitochondrial Code</option>
                    <option value="4">The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
                    <option value="5">The Invertebrate Mitochondrial Code</option>
                    <option value="6">The Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
                    <option value="9">The Echinoderm and Flatworm Mitochondrial Code</option>
                    <option value="10">The Euplotid Nuclear Code</option>
                    <option value="11">The Bacterial, Archaeal and Plant Plastid Code</option>
                    <option value="12">The Alternative Yeast Nuclear Code</option>
                    <option value="13">The Ascidian Mitochondrial Code</option>
                    <option value="14">The Alternative Flatworm Mitochondrial Code</option>
                    <option value="15">Blepharisma Nuclear Code</option>
                    <option value="16">Chlorophycean Mitochondrial Code</option>
                    <option value="21">Trematode Mitochondrial Code</option>
                    <option value="22">Scenedesmus obliquus Mitochondrial Code</option>
                    <option value="23">Thraustochytrium Mitochondrial Code</option>
                    <option value="24">Rhabdopleuridae Mitochondrial Code</option>
                    <option value="25">Candidate Division SR1 and Gracilibacteria Code</option>
                    <option value="26">Pachysolen tannophilus Nuclear Code</option>
                    <option value="27">Karyorelict Nuclear Code</option>
                    <option value="28">Condylostoma Nuclear Code</option>
                    <option value="29">Mesodinium Nuclear Code</option>
                    <option value="30">Peritrich Nuclear Code</option>
                    <option value="31">Blastocrithidia Nuclear Code</option>
                </param>
                <param argument="--translate" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Translate ORF to amino acid"/>
                <param argument="--use-all-table-starts" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Use all alternatives for a start codon in the genetic table, if false - only ATG (AUG)"/>
                <!-- Sequence splitting -->
                <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value"/>
                <param argument="--sequence-overlap" type="integer" min="0" value="0" label="Overlap between sequences"/>
                <param argument="--sequence-split-mode" type="select" label="Sequence split mode">
                    <option value="0">Copy data</option>
                    <option value="1" selected="true">Soft link data and write new index</option>
                </param>
                <param argument="--headers-split-mode" type="select" label="Headers split mode">
                    <option value="0" selected="true">Split position</option>
                    <option value="1">Original header</option>
                </param>
                <param argument="--prefilter-mode" type="select" label="Prefilter mode">
                    <option value="0" selected="true">Kmer/ungapped</option>
                    <option value="1">Ungapped</option>
                    <option value="2">No filter</option>
                </param>
            </section>
            <expand macro="common_section"/>
            <!-- Expert options of "mmseqs taxonomy"; shared ones come from the macro. -->
            <section name="expert" title="Expert">
                <expand macro="expert_common_parameters"/>
                <param argument="--chain-alignments" type="integer" value="0" min="0" label="Chain alignments"/>
                <param argument="--merge-query" type="integer" value="1" min="0" label="Combine ORFs/split sequences to a single entry"/>
            </section>
        </section>
        <!-- Options of "mmseqs createtsv", which writes the taxonomy result as taxo_result.tsv. -->
        <section name="createtsv" title="Create a tsv report from taxonomy output ">
            <param argument="--first-seq-as-repr" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Use the first sequence of the clustering result as representative sequence"/>
            <param argument="--target-column" type="integer" value="1" min="0" label="Select a target column, 0 if no target id exists"/>
            <param argument="--full-header" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Replace DB ID by its corresponding Full Header"/>
            <param argument="--idx-seq-src" type="select" label="Index sequences source">
                <option value="0" selected="true">Auto</option>
                <option value="1">Split/translated sequences</option>
                <option value="2">Input sequences</option>
            </param>
        </section>
        <!-- Optional reports: each switch controls one "mmseqs taxonomyreport" call in the command
             and the matching output dataset. -->
        <conditional name="kraken_report">
            <param name="keep_report" type="select" label="Do you want a Kraken style report">
                <option value="Yes" selected="true">Yes</option>
                <option value="No">No</option>
            </param>
            <when value="Yes"/>
            <when value="No"/>
        </conditional>
        <conditional name="krona_report">
            <param name="keep_report" type="select" label="Do you want a Krona style report">
                <option value="Yes" selected="true">Yes</option>
                <option value="No">No</option>
            </param>
            <when value="Yes"/>
            <when value="No"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Always produced: TSV written by "mmseqs createtsv" -->
        <data name="output_taxonomy_tsv" format="tabular" label="${tool.name} on ${on_string}: Taxonomy Report" from_work_dir="taxo_result.tsv"/>
        <!-- Only kept when the Kraken style report was requested -->
        <data name="output_taxonomy_kraken" format="txt" label="${tool.name} on ${on_string}: Kraken Report" from_work_dir="taxo_result.txt">
            <filter>kraken_report['keep_report'] == "Yes"</filter>
        </data>
        <!-- Only kept when the Krona style report was requested -->
        <data name="output_taxonomy_krona" format="html" label="${tool.name} on ${on_string}: Krona Report" from_work_dir="taxo_result.html">
            <filter>krona_report['keep_report'] == "Yes"</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test with Kraken report: nucleotide input, reference database filtered on taxon ID 2,
             Krona report switched off, so exactly two outputs (TSV + Kraken report) are expected. -->
        <test expect_num_outputs="2">
            <!-- Build the taxonomy database inside the job instead of shipping it with the test data -->
            <param name="download_tax_db" value="true"/>
            <section name="createdb">
                <param name="input_fasta" ftype="fasta" value="light_mystery_reads.fasta"/>
                <conditional name="alph_type">
                    <param name="dbtype" value="2"/>
                </conditional>
            </section>
            <section name="database">
                <conditional name="database_type">
                    <param name="type" value="amino_acid_tax"/>
                    <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025"/>
                </conditional>
            </section>
            <section name="filtertaxseqdb">
                <param name="taxon_list" value="2"/>
            </section>
            <section name="taxonomy">
                <section name="prefilter">
                    <param name="mask_n_repeat" value="1"/>
                </section>
            </section>
            <conditional name="krona_report">
                <param name="keep_report" value="No"/>
            </conditional>
            <!-- Columns are separated by tab characters (&#009;) -->
            <output name="output_taxonomy_tsv" ftype="tabular">
                <assert_contents>
                    <has_line line="MYSTERY.64&#009;119060&#009;family&#009;Burkholderiaceae&#009;1&#009;1&#009;1&#009;1.000"/>
                    <has_n_columns n="8"/>
                </assert_contents>
            </output>
            <output name="output_taxonomy_kraken" ftype="txt">
                <assert_contents>
                    <has_text text="kingdom"/>
                    <has_text text="Pseudomonadati"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test with Krona report only: default dbtype and no taxon filtering; the Kraken report
             is switched off. Two outputs are therefore expected: the TSV table and the Krona HTML report. -->
        <test expect_num_outputs="2">
            <!-- download_tax_db=true selects the test-only branch of the command that copies the database locally. -->
            <param name="download_tax_db" value="true"/>
            <section name="createdb">
                <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
            </section>
            <section name="database">
                <conditional name="database_type">
                    <param name="type" value="amino_acid_tax"/>
                    <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025" />
                </conditional>
            </section>
            <!-- Disable Kraken so that only the Krona report accompanies the TSV table. -->
            <conditional name="kraken_report">
                <param name="keep_report" value="No"/>
            </conditional>
            <output name="output_taxonomy_tsv" ftype="tabular">
                <assert_contents>
                    <has_line line="MYSTERY.222&#009;1236&#009;class&#009;Gammaproteobacteria&#009;1&#009;1&#009;1&#009;1.000"/>
                    <has_line line="MYSTERY.64&#009;119060&#009;family&#009;Burkholderiaceae&#009;1&#009;1&#009;1&#009;1.000"/>
                    <has_n_columns n="8"/>
                </assert_contents>
            </output>
            <output name="output_taxonomy_krona" ftype="html">
                <assert_contents>
                    <has_line line="// Krona is a flexible tool for exploring the relative proportions of"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test with both reports: neither report conditional is set, so the selected defaults
             (keep_report "Yes" for Kraken and Krona) apply and all three outputs are expected. -->
        <test expect_num_outputs="3">
            <!-- download_tax_db=true selects the test-only branch of the command that copies the database locally. -->
            <param name="download_tax_db" value="true"/>
            <section name="createdb">
                <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
            </section>
            <section name="database">
                <conditional name="database_type">
                    <param name="type" value="amino_acid_tax"/>
                    <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025" />
                </conditional>
            </section>
            <output name="output_taxonomy_tsv" ftype="tabular">
                <assert_contents>
                    <has_line line="MYSTERY.222&#009;1236&#009;class&#009;Gammaproteobacteria&#009;1&#009;1&#009;1&#009;1.000"/>
                    <has_line line="MYSTERY.64&#009;119060&#009;family&#009;Burkholderiaceae&#009;1&#009;1&#009;1&#009;1.000"/>
                    <has_n_columns n="8"/>
                </assert_contents>
            </output>
            <output name="output_taxonomy_krona" ftype="html">
                <assert_contents>
                    <has_line line="// Krona is a flexible tool for exploring the relative proportions of"/>
                </assert_contents>
            </output>
            <output name="output_taxonomy_kraken" ftype="txt">
                <assert_contents>
                    <has_text text="kingdom"/>
                    <has_text text="Pseudomonadati"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**MMseqs2: ultra fast and sensitive sequence search and clustering suite**

MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows.
The software is designed to run on multiple cores and servers and exhibits very good scalability.
MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity.
It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.

**Usage**

* Convert FASTA/Q file(s) to MMseqs sequence DB format
    *mmseqs createdb <i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB> [options]*

* Filter taxonomy sequence database
    *mmseqs filtertaxseqdb <i:taxSeqDB> <o:taxSeqDB> [options]*

* Taxonomy assignment by computing the lowest common ancestor of homologs
    *mmseqs taxonomy <i:queryDB> <i:targetDB> <o:taxaDB> <tmpDir> [options]*

* Convert result DB to tab-separated flat file
    *mmseqs createtsv <i:queryDB> [<i:targetDB>] <i:resultDB> <o:tsvFile> [options]*

* Create a taxonomy report in Kraken or Krona format
    *mmseqs taxonomyreport <i:seqTaxDB> <i:taxResultDB/resultDB/sequenceDB> <o:taxonomyReport> [options]*

For more information, see the MMseqs2 repository: https://github.com/soedinglab/MMseqs2

    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Mon, 14 Apr 2025 18:39:38 +0000
parents	d0acde079e2e
children