Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering

<tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Free-text description that accompanies the tool name declared on the <tool> element -->
    <description>
        of very large datasets
    </description>
    <!-- Shared definitions are imported from macro.xml.
         NOTE(review): the macros expanded below (biotools, requirements, version_command)
         are presumably defined in that file; it is not visible from here. -->
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
## Run the easy-linclust workflow on the input FASTA.
## 'result' is the output prefix (the declared outputs are collected from result_rep_seq.fasta,
## result_all_seqs.fasta and result_cluster.tsv) and 'tmp' is the scratch directory.
mmseqs easy-linclust
    '$input_fasta'
    'result'
    'tmp'

    ## Options exposed only for an explicitly selected alphabet; nothing is added for
    ## dbtype 0 (automatic). The parameters live directly inside the selected <when>
    ## block of the alph_type conditional, so they are addressed as $alph_type.<name>.
    ## str() forces a plain string comparison of the selected value.
    #if str($alph_type.dbtype) == "1"
        --comp-bias-corr-scale $alph_type.comp_bias_corr_scale
        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
    #elif str($alph_type.dbtype) == "2"
        --zdrop $alph_type.zdrop
        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
        --adjust-kmer-len $alph_type.adjust_kmer_len
    #end if

    ##Pre-filter options
    --add-self-matches $prefilter.add_self_matches
    -k $prefilter.kmer_length
    ##--split-memory-limit BYTE        Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
    --mask $prefilter.mask
    --mask-prob $prefilter.mask_prob
    --mask-lower-case $prefilter.mask_lower_case
    --spaced-kmer-mode $prefilter.spaced_kmer_mode
    ##--spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
    ##--disk-space-limit BYTE          Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]

    ##Align options
    -a $align.convertalis
    ##NOTE(review): the next two parameters appear to overlap; confirm against the MMseqs2 help
    --alignment-mode $align.alignment_mode
    --alignment-output-mode $align.alignment_output_mode
    --wrapped-scoring $align.wrapped_scoring
    -e $align.evalue
    --min-seq-id $min_seq_id
    --min-aln-len $align.min_aln_len
    --seq-id-mode $align.seq_id_mode
    --alt-ali $align.alt_ali
    -c $cov
    --cov-mode $cov_mode
    ## Both limits are optional in the form: emit the flag only when a value was
    ## supplied, otherwise the flag would be written without an argument.
    #if str($align.max_rejected) != ""
        --max-rejected $align.max_rejected
    #end if
    #if str($align.max_accept) != ""
        --max-accept $align.max_accept
    #end if
    --score-bias $align.score_bias
    --realign $align.realign
    --realign-score-bias $align.realign_score_bias
    --realign-max-seqs $align.realign_max_seqs
    --corr-score-weight $align.corr_score_weight

    ##Clustering options
    --cluster-mode $cluster.cluster_mode
    --max-iterations $cluster.max_iterations
    --similarity-type $cluster.similarity_type

    ##kmermatcher options
    ##--weights STR                    Weights used for cluster priorization []
    --cluster-weight-threshold $kmermatcher.cluster_weight_threshold
    --kmer-per-seq $kmermatcher.kmer_per_seq
    --hash-shift $kmermatcher.hash_shift
    --include-only-extendable $kmermatcher.include_only_extendable
    --ignore-multi-kmer $kmermatcher.ignore_multi_kmer

    ##Profile options
    ##--pca                            Pseudo count admixture strength []
    ##--pcb                            Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []

    ##Misc options
    --rescore-mode $misc.rescore_mode
    --dbtype $alph_type.dbtype
    --shuffle $misc.shuffle
    --id-offset $misc.id_offset

    ##Common options
    ##--compressed INT                 Write compressed output [0]
    ## Use the number of slots Galaxy allocated to the job, falling back to 1
    --threads "\${GALAXY_SLOTS:-1}"
    ##-v INT                           Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
    --max-seq-len $common.max_seq_len
    ##--db-load-mode INT               Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
    ##--mpi-runner STR                 Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
    ##--force-reuse BOOL               Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
    ##--remove-tmp-files BOOL          Delete temporary files [0]

    ##Expert options
    --filter-hits $expert.filter_hits
    --sort-results $expert.sort_results
    ##--create-lookup INT              Create database lookup file (can be very large) [0]
    ]]></command>
    <inputs>
        <!-- Sequences to cluster -->
        <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" />
        <!-- Alphabet selection. Each <when> block holds the options offered for that
             alphabet; they are direct children of the conditional. -->
        <conditional name="alph_type">
            <param argument="--dbtype" type="select" label="Input data type" help="" >
                <option value="0" selected="true">Automatic</option>
                <option value="1">Amino acid</option>
                <option value="2">Nucleotides</option>
            </param>
            <when value="0"/>
            <when value="1">
                <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
            </when>
            <when value="2">
                <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
                <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
            </when>
        </conditional>
        <!-- Main clustering criteria, kept at the top level of the form -->
        <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
        <param argument="--cov-mode" type="select" label="Coverage mode" help="" >
            <option value="0" selected="true">Coverage of query and target</option>
            <option value="1">Coverage of target</option>
            <option value="2">Coverage of query</option>
            <option value="3">Target seq. length has to be at least x% of query length</option>
            <option value="4">Query seq. length has to be at least x% of target length</option>
            <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
        </param>
        <!-- Coverage is a fraction, so it is bounded to [0, 1] like the sequence identity above -->
        <param argument="-c" name="cov" type="float" min="0" max="1" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/>
        <section name="prefilter" title="Pre-filter">
            <expand macro="prefilter_common_parameters" />
            <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help="">
                <option value="0" selected="true">Use consecutive positions in k-mers</option>
                <option value="1">Use spaced k-mers</option>
            </param>
        </section>
        <section name="align" title="Align">
            <expand macro="align_common_parameters" />
            <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" >
                <option value="0" selected="true">Automatic</option>
                <option value="1">Only score and end_pos</option>
                <option value="2">Also start_pos and cov</option>
                <option value="3">Also seq.id</option>
                <option value="4">Only ungapped alignment</option>
            </param>
            <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
            <!-- Both limits are optional and may therefore be left empty in the form -->
            <param argument="--max-rejected" type="integer" min="0" value="2147483647" optional="true" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/>
            <param argument="--max-accept" type="integer" min="0" value="2147483647" optional="true" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/>
        </section>
        <section name="cluster" title="Clustering">
            <param argument="--cluster-mode" type="select" label="Cluster mode" help="" >
                <option value="0" selected="true">Set-Cover (greedy)</option>
                <option value="1">Connected component (BLASTclust)</option>
                <option value="2">Greedy clustering by sequence length (CDHIT)</option>
            </param>
            <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/>
            <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" >
                <option value="1">Alignment score</option>
                <option value="2" selected="true">Sequence identity</option>
            </param>
        </section>
        <section name="kmermatcher" title="K-mer matcher">
            <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster priorization" help=""/>
            <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/>
            <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/>
            <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/>
            <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/>
        </section>
        <section name="misc" title="Misc">
            <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" >
                <option value="0" selected="true">Hamming distance</option>
                <option value="1">Local alignment (score only)</option>
                <option value="2">Local alignment</option>
                <option value="3">Global alignment</option>
                <option value="4">Longest alignment fulfilling window quality criterion</option>
            </param>
            <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/>
            <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
        </section>
        <expand macro="common_section"/>
        <section name="expert" title="Expert">
            <expand macro="expert_common_parameters" />
        </section>
        <!-- Chooses which result files become datasets; the values are matched by the output filters.
             NOTE(review): min="1" is documented for numeric and data parameters; confirm that it
             is enforced for a multiple select. -->
        <section name="output_files" title="Selection of the output files">
          <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true"  label="Output files selection">
              <option value="file_rep_seq" selected="true">Representative sequences in fasta</option>
              <option value="file_all_seq" selected="true">FASTA-like per cluster</option>
              <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option>
          </param>
        </section>
    </inputs>
    <outputs>
        <!-- Each dataset is collected from the job working directory (files written with the
             'result' prefix) and is only created when its entry is ticked in output_selection;
             the leading truthiness check covers an empty selection. -->
        <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" >
            <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter>
        </data>
        <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" >
            <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter>
        </data>
        <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list">
            <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Nucleotide run (dbtype=2); every parameter not set here keeps its form default.
             All three outputs are expected. Sizes and line counts are checked with a
             tolerance (delta) rather than exact values. -->
        <test expect_num_outputs="3">
            <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
            <conditional name="alph_type">
                <param name="dbtype" value="2"/>
            </conditional>
            <!-- Representative sequences: must contain one known read sequence -->
            <output name="output_rep_seq" ftype="fasta">
                <assert_contents>
                    <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/>
                    <has_size value="551000" delta="50000"/>
                </assert_contents>
            </output>
            <!-- FASTA-like per-cluster file: known sequence plus approximate size and line count -->
            <output name="output_all_seq" ftype="fasta">
                <assert_contents>
                    <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
                    <has_size value="627000" delta="20000"/>
                    <has_n_lines n="14806" delta="500"/>
                </assert_contents>
            </output>
            <!-- Cluster table: two columns; the expected line is MYSTERY.13 twice, separated by a tab (&#009;) -->
            <output name="output_cluster" ftype="tabular">
                <assert_contents>
                    <has_line line="MYSTERY.13&#009;MYSTERY.13"/>
                    <has_n_columns n="2"/>
                    <has_size value="113000" delta="50000"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**MMseqs2: ultra fast and sensitive sequence search and clustering suite**

MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows.
The software is designed to run on multiple cores and servers and exhibits very good scalability.
MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity.
It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.

**Usage**

MMseqs2 easy-linclust clusters the entries of a FASTA/FASTQ file using the linear-time Linclust algorithm.
It offers an efficient clustering workflow that scales linearly with input size. It is similar to easy-cluster (which uses cascaded clustering) but better suited to very large datasets.

https://github.com/soedinglab/MMseqs2

    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Thu, 27 Mar 2025 16:42:57 +0000
parents	9f6869226de1
children