Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering

diff mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7
author: iuc
date: Thu, 27 Mar 2025 14:37:56 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mmseqs2_easy_linclust_clustering.xml	Thu Mar 27 14:37:56 2025 +0000
@@ -0,0 +1,242 @@
+<tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>
+        of very large datasets
+    </description>
+    <macros>
+        <import>macro.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements"/>
+    <expand macro="version_command"/>
+    <command detect_errors="exit_code"><![CDATA[
+mmseqs easy-linclust
+    '$input_fasta'
+    'result'
+    'tmp'
+    ##Alphabet-specific options, emitted only for an explicit dbtype (1 or 2). str() is required: a quoted '$var' inside a Cheetah directive is a plain string literal and is never substituted
+    #if str($alph_type.dbtype) == "1"
+        --comp-bias-corr-scale $alph_type.comp_bias_corr_scale
+        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
+    #elif str($alph_type.dbtype) == "2"
+        --zdrop $alph_type.zdrop
+        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
+        --adjust-kmer-len $alph_type.adjust_kmer_len
+    #end if
+    ##Pre-filter options
+    --add-self-matches $prefilter.add_self_matches
+    -k $prefilter.kmer_length
+    ##--split-memory-limit BYTE        Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
+    --mask $prefilter.mask
+    --mask-prob $prefilter.mask_prob
+    --mask-lower-case $prefilter.mask_lower_case
+    --spaced-kmer-mode $prefilter.spaced_kmer_mode
+    ##--spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
+    ##--disk-space-limit BYTE          Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]
+
+    ##Align options
+    -a $align.convertalis
+    ##The next 2 parameters seem to be the same
+    --alignment-mode $align.alignment_mode
+    --alignment-output-mode $align.alignment_output_mode
+    --wrapped-scoring $align.wrapped_scoring
+    -e $align.evalue
+    --min-seq-id $min_seq_id
+    --min-aln-len $align.min_aln_len
+    --seq-id-mode $align.seq_id_mode
+    --alt-ali $align.alt_ali
+    -c $cov
+    --cov-mode $cov_mode
+    --max-rejected $align.max_rejected
+    --max-accept $align.max_accept
+    --score-bias $align.score_bias
+    --realign $align.realign
+    --realign-score-bias $align.realign_score_bias
+    --realign-max-seqs $align.realign_max_seqs
+    --corr-score-weight $align.corr_score_weight
+
+    ##Clustering options
+    --cluster-mode $cluster.cluster_mode
+    --max-iterations $cluster.max_iterations
+    --similarity-type $cluster.similarity_type
+
+    ##kmermatcher options
+    ##--weights STR                    Weights used for cluster prioritization []
+    --cluster-weight-threshold $kmermatcher.cluster_weight_threshold
+    --kmer-per-seq $kmermatcher.kmer_per_seq
+    --hash-shift $kmermatcher.hash_shift
+    --include-only-extendable $kmermatcher.include_only_extendable
+    --ignore-multi-kmer $kmermatcher.ignore_multi_kmer
+
+    ##Profile options
+    ##--pca                            Pseudo count admixture strength []
+    ##--pcb                            Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []
+
+    ##Misc options
+    --rescore-mode $misc.rescore_mode
+    --dbtype $alph_type.dbtype
+    --shuffle $misc.shuffle
+    --id-offset $misc.id_offset
+
+    ##Common options
+    ##--compressed INT                 Write compressed output [0]
+    --threads "\${GALAXY_SLOTS:-1}"
+    ##-v INT                           Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
+    --max-seq-len $common.max_seq_len
+    ##--db-load-mode INT               Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
+    ##--mpi-runner STR                 Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
+    ##--force-reuse BOOL               Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
+    ##--remove-tmp-files BOOL          Delete temporary files [0]
+
+    ##Expert options
+    --filter-hits $expert.filter_hits
+    --sort-results $expert.sort_results
+    ##--create-lookup INT              Create database lookup file (can be very large) [0]
+    ]]></command>
+    <inputs>
+        <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" />
+        <conditional name="alph_type"> <!-- Input alphabet selector; the chosen dbtype value decides which extra parameters are shown -->
+            <param argument="--dbtype" type="select" label="Input data type" help="" >
+                <option value="0" selected="true">Automatic</option>
+                <option value="1">Amino acid</option>
+                <option value="2">Nucleotides</option>
+            </param>
+            <when value="0"/> <!-- Automatic: no alphabet-specific parameters are exposed -->
+            <when value="1"> <!-- Amino acid: composition bias scale and k-mer scaling (default 0.000) -->
+                <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
+                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
+            </when>
+            <when value="2"> <!-- Nucleotides: zdrop, k-mer scaling (default 0.200) and k-mer length adjustment -->
+                <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
+                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
+                <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
+            </when>
+        </conditional>
+        <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
+        <param argument="--cov-mode" type="select" label="Coverage mode" help="" >
+                <option value="0" selected="true">Coverage of query and target</option>
+                <option value="1">Coverage of target</option>
+                <option value="2">Coverage of query</option>
+                <option value="3">Target seq. length has to be at least x% of query length</option>
+                <option value="4">Query seq. length has to be at least x% of target length</option>
+                <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
+        </param>
+        <param argument="-c" name="cov" type="float" min="0" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/>
+        <section name="prefilter" title="Pre-filter">
+            <expand macro="prefilter_common_parameters" />
+            <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help="">
+                <option value="0" selected="true">Use consecutive positions in k-mers</option>
+                <option value="1">Use spaced k-mers</option>
+            </param>
+        </section>
+        <section name="align" title="Align"> <!-- Alignment stage parameters; shared ones come from the align_common_parameters macro -->
+            <expand macro="align_common_parameters" />
+            <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" >
+                <option value="0" selected="true">Automatic</option>
+                <option value="1">Only score and end_pos</option>
+                <option value="2">Also start_pos and cov</option>
+                <option value="3">Also seq.id</option>
+                <option value="4">Only ungapped alignment</option>
+            </param>
+            <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
+            <param argument="--max-rejected" type="integer" min="0" value="2147483647" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/> <!-- required: the command always passes this flag, so it must never be empty -->
+            <param argument="--max-accept" type="integer" min="0" value="2147483647" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/> <!-- required for the same reason as max-rejected -->
+        </section>
+        <section name="cluster" title="Clustering">
+            <param argument="--cluster-mode" type="select" label="Cluster mode" help="" >
+                <option value="0" selected="true">Set-Cover (greedy)</option>
+                <option value="1">Connected component (BLASTclust)</option>
+                <option value="2">Greedy clustering by sequence length (CDHIT)</option>
+            </param>
+            <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/>
+            <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" >
+                <option value="1">Alignment score</option>
+                <option value="2" selected="true">Sequence identity</option>
+            </param>
+        </section>
+        <section name="kmermatcher" title="K-mer matcher"> <!-- Parameters of the k-mer matching stage; each one is passed unconditionally by the command -->
+            <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster prioritization" help=""/>
+            <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/>
+            <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/>
+            <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/>
+            <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/>
+        </section>
+        <section name="misc" title="Misc">
+            <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" >
+                <option value="0" selected="true">Hamming distance</option>
+                <option value="1">Local alignment (score only)</option>
+                <option value="2">Local alignment</option>
+                <option value="3">Global alignment</option>
+                <option value="4">Longest alignment fulfilling window quality criterion</option>
+            </param>
+            <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/>
+            <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
+        </section>
+        <expand macro="common_section"/>
+        <section name="expert" title="Expert">
+            <expand macro="expert_common_parameters" />
+        </section>
+        <section name="output_files" title="Selection of the output files"> <!-- Each selected value enables the output whose filter tests for it -->
+            <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection">
+                <option value="file_rep_seq" selected="true">Representative sequences in fasta</option>
+                <option value="file_all_seq" selected="true">FASTA-like per cluster</option>
+                <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option>
+            </param>
+        </section>
+    </inputs>
+    <outputs> <!-- Each dataset is collected from the working directory and kept only if its key is ticked in output_selection -->
+        <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" >
+            <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter>
+        </data>
+        <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" >
+            <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter>
+        </data>
+        <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list">
+            <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="3"> <!-- Nucleotide run (dbtype 2); no other parameter is set, and all three outputs are expected -->
+            <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
+            <conditional name="alph_type">
+                <param name="dbtype" value="2"/>
+            </conditional>
+            <output name="output_rep_seq" ftype="fasta"> <!-- Representative sequences: one known read plus an approximate file size -->
+                <assert_contents>
+                    <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/>               
+                    <has_size value="551000" delta="50000"/>
+                </assert_contents>
+            </output>
+            <output name="output_all_seq" ftype="fasta"> <!-- Per-cluster FASTA-like file: one known read, approximate size and line count -->
+                <assert_contents>
+                    <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
+                    <has_size value="627000" delta="20000"/>
+                    <has_n_lines n="14806" delta="500"/>
+                </assert_contents>
+            </output>
+            <output name="output_cluster" ftype="tabular"> <!-- Cluster table: two tab-separated columns, including a row where MYSTERY.13 is paired with itself -->
+                <assert_contents>
+                    <has_line line="MYSTERY.13&#009;MYSTERY.13"/>
+                    <has_n_columns n="2"/>
+                    <has_size value="113000" delta="50000"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**MMseqs2: ultra fast and sensitive sequence search and clustering suite**
+
+MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. 
+MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. 
+The software is designed to run on multiple cores and servers and exhibits very good scalability. 
+MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. 
+It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
+
+**Usage** 
+MMseqs2 easy-linclust is used to cluster entries from a FASTA/FASTQ file using the cascaded clustering algorithm.
+It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently.
+
+https://github.com/soedinglab/MMseqs2
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
\ No newline at end of file