mmseqs2_easy_linclust_clustering: mmseqs2_easy_linclust

comparison mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7

author	iuc
date	Thu, 27 Mar 2025 14:37:56 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:9f6869226de1
+<tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+<!-- Tagline that accompanies the tool name ("MMseqs2 Sequence Clustering"). -->
+<description>of very large datasets</description>
+<!-- macro.xml is a separate file; it provides every macro expanded in this tool. -->
+<macros>
+    <import>macro.xml</import>
+</macros>
+<!-- The expansions below are defined in macro.xml; their content is not visible from this file. -->
+<expand macro="biotools"/>
+<expand macro="requirements"/>
+<expand macro="version_command"/>
+<command detect_errors="exit_code"><![CDATA[
+## Cluster the input FASTA with `mmseqs easy-linclust`.
+## Positional arguments: <input> <output prefix> <tmp dir>. The output prefix
+## 'result' yields the files collected by <outputs> via from_work_dir
+## (result_rep_seq.fasta, result_all_seqs.fasta, result_cluster.tsv).
+mmseqs easy-linclust
+    '$input_fasta'
+    'result'
+    'tmp'
+
+    ## Alphabet-specific options.
+    ## Parameters declared inside the <conditional name="alph_type"> are
+    ## addressed as $alph_type.<name>; there is no intermediate `type` level.
+    ## The selector is compared through str() so the test is an explicit
+    ## string comparison. "0" (Automatic) adds no extra option.
+    #if str($alph_type.dbtype) == "1"
+        --comp-bias-corr-scale $alph_type.comp_bias_corr_scale
+        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
+    #elif str($alph_type.dbtype) == "2"
+        --zdrop $alph_type.zdrop
+        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
+        --adjust-kmer-len $alph_type.adjust_kmer_len
+    #end if
+
+    ## Pre-filter options
+    --add-self-matches $prefilter.add_self_matches
+    -k $prefilter.kmer_length
+    ## Not exposed: --split-memory-limit BYTE        Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
+    --mask $prefilter.mask
+    --mask-prob $prefilter.mask_prob
+    --mask-lower-case $prefilter.mask_lower_case
+    --spaced-kmer-mode $prefilter.spaced_kmer_mode
+    ## Not exposed: --spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
+    ## Not exposed: --disk-space-limit BYTE          Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]
+
+    ## Align options
+    -a $align.convertalis
+    ## NOTE(review): the next two parameters appear to overlap - confirm against the MMseqs2 documentation.
+    --alignment-mode $align.alignment_mode
+    --alignment-output-mode $align.alignment_output_mode
+    --wrapped-scoring $align.wrapped_scoring
+    -e $align.evalue
+    --min-seq-id $min_seq_id
+    --min-aln-len $align.min_aln_len
+    --seq-id-mode $align.seq_id_mode
+    --alt-ali $align.alt_ali
+    -c $cov
+    --cov-mode $cov_mode
+    ## --max-rejected and --max-accept are declared optional="true" in <inputs>.
+    ## Emit each flag only when a value is present; otherwise the flag would be
+    ## written without a value and swallow the option that follows it.
+    #if str($align.max_rejected) != ''
+        --max-rejected $align.max_rejected
+    #end if
+    #if str($align.max_accept) != ''
+        --max-accept $align.max_accept
+    #end if
+    --score-bias $align.score_bias
+    --realign $align.realign
+    --realign-score-bias $align.realign_score_bias
+    --realign-max-seqs $align.realign_max_seqs
+    --corr-score-weight $align.corr_score_weight
+
+    ## Clustering options
+    --cluster-mode $cluster.cluster_mode
+    --max-iterations $cluster.max_iterations
+    --similarity-type $cluster.similarity_type
+
+    ## kmermatcher options
+    ## Not exposed: --weights STR                    Weights used for cluster prioritization []
+    --cluster-weight-threshold $kmermatcher.cluster_weight_threshold
+    --kmer-per-seq $kmermatcher.kmer_per_seq
+    --hash-shift $kmermatcher.hash_shift
+    --include-only-extendable $kmermatcher.include_only_extendable
+    --ignore-multi-kmer $kmermatcher.ignore_multi_kmer
+
+    ## Profile options
+    ## Not exposed: --pca                            Pseudo count admixture strength []
+    ## Not exposed: --pcb                            Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []
+
+    ## Misc options
+    --rescore-mode $misc.rescore_mode
+    --dbtype $alph_type.dbtype
+    --shuffle $misc.shuffle
+    --id-offset $misc.id_offset
+
+    ## Common options
+    ## Not exposed: --compressed INT                 Write compressed output [0]
+    ## Thread count follows the slots granted by Galaxy, falling back to 1.
+    --threads "\${GALAXY_SLOTS:-1}"
+    ## Not exposed: -v INT                           Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
+    --max-seq-len $common.max_seq_len
+    ## Not exposed: --db-load-mode INT               Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
+    ## Not exposed: --mpi-runner STR                 Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
+    ## Not exposed: --force-reuse BOOL               Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
+    ## Not exposed: --remove-tmp-files BOOL          Delete temporary files [0]
+
+    ## Expert options
+    --filter-hits $expert.filter_hits
+    --sort-results $expert.sort_results
+    ## Not exposed: --create-lookup INT              Create database lookup file (can be very large) [0]
+]]></command>
+<inputs>
+    <!-- Sequences to cluster; passed as the first positional argument of the command. -->
+    <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" />
+
+    <!-- Alphabet selection. The selected value is forwarded as the dbtype option of the
+         command, and each branch contributes its own alphabet-specific parameters,
+         addressed in the command as $alph_type.<name>. -->
+    <conditional name="alph_type">
+        <param argument="--dbtype" type="select" label="Input data type" help="">
+            <option value="0" selected="true">Automatic</option>
+            <option value="1">Amino acid</option>
+            <option value="2">Nucleotides</option>
+        </param>
+        <!-- Automatic: no alphabet-specific parameters. -->
+        <when value="0"/>
+        <when value="1">
+            <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
+            <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
+        </when>
+        <when value="2">
+            <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
+            <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
+            <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
+        </when>
+    </conditional>
+
+    <!-- Top-level thresholds (not inside a section). -->
+    <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
+    <param argument="--cov-mode" type="select" label="Coverage mode" help="">
+        <option value="0" selected="true">Coverage of query and target</option>
+        <option value="1">Coverage of target</option>
+        <option value="2">Coverage of query</option>
+        <option value="3">Target seq. length has to be at least x% of query length</option>
+        <option value="4">Query seq. length has to be at least x% of target length</option>
+        <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
+    </param>
+    <!-- The value is a fraction, so it is bounded to [0, 1] like the min-seq-id parameter above. -->
+    <param argument="-c" name="cov" type="float" min="0" max="1" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/>
+
+    <!-- Pre-filter: shared parameters come from macro.xml (not visible here). -->
+    <section name="prefilter" title="Pre-filter">
+        <expand macro="prefilter_common_parameters" />
+        <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help="">
+            <option value="0" selected="true">Use consecutive positions in k-mers</option>
+            <option value="1">Use spaced k-mers</option>
+        </param>
+    </section>
+
+    <!-- Alignment: shared parameters come from macro.xml (not visible here). -->
+    <section name="align" title="Align">
+        <expand macro="align_common_parameters" />
+        <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="">
+            <option value="0" selected="true">Automatic</option>
+            <option value="1">Only score and end_pos</option>
+            <option value="2">Also start_pos and cov</option>
+            <option value="3">Also seq.id</option>
+            <option value="4">Only ungapped alignment</option>
+        </param>
+        <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
+        <!-- Both limits are optional, so the fields may be left empty. -->
+        <param argument="--max-rejected" type="integer" min="0" value="2147483647" optional="true" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/>
+        <param argument="--max-accept" type="integer" min="0" value="2147483647" optional="true" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/>
+    </section>
+
+    <section name="cluster" title="Clustering">
+        <param argument="--cluster-mode" type="select" label="Cluster mode" help="">
+            <option value="0" selected="true">Set-Cover (greedy)</option>
+            <option value="1">Connected component (BLASTclust)</option>
+            <option value="2">Greedy clustering by sequence length (CDHIT)</option>
+        </param>
+        <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/>
+        <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="">
+            <option value="1">Alignment score</option>
+            <option value="2" selected="true">Sequence identity</option>
+        </param>
+    </section>
+
+    <section name="kmermatcher" title="K-mer matcher">
+        <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster prioritization" help=""/>
+        <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/>
+        <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/>
+        <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/>
+        <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/>
+    </section>
+
+    <section name="misc" title="Misc">
+        <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="">
+            <option value="0" selected="true">Hamming distance</option>
+            <option value="1">Local alignment (score only)</option>
+            <option value="2">Local alignment</option>
+            <option value="3">Global alignment</option>
+            <option value="4">Longest alignment fulfilling window quality criterion</option>
+        </param>
+        <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/>
+        <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
+    </section>
+
+    <!-- "common" and "expert" parameters are defined in macro.xml (not visible here). -->
+    <expand macro="common_section"/>
+    <section name="expert" title="Expert">
+        <expand macro="expert_common_parameters" />
+    </section>
+
+    <!-- Each option value is matched by the <filter> of one dataset in <outputs>. -->
+    <section name="output_files" title="Selection of the output files">
+        <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection">
+            <option value="file_rep_seq" selected="true">Representative sequences in FASTA</option>
+            <option value="file_all_seq" selected="true">FASTA-like per cluster</option>
+            <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option>
+        </param>
+    </section>
+</inputs>
+<outputs>
+    <!-- Every dataset is collected from the job working directory (from_work_dir) and is
+         only created when its value is ticked in output_files/output_selection. -->
+    <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences">
+        <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter>
+    </data>
+    <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster">
+        <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter>
+    </data>
+    <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list">
+        <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter>
+    </data>
+</outputs>
+<tests>
+    <!-- Nucleotide run (dbtype=2) with all three outputs selected by default. -->
+    <test expect_num_outputs="3">
+        <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
+        <conditional name="alph_type">
+            <param name="dbtype" value="2"/>
+        </conditional>
+
+        <!-- Representative sequences: one known read plus an approximate file size. -->
+        <output name="output_rep_seq" ftype="fasta">
+            <assert_contents>
+                <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/>
+                <has_size value="551000" delta="50000"/>
+            </assert_contents>
+        </output>
+
+        <!-- Per-cluster FASTA-like file: one known read, approximate size and line count. -->
+        <output name="output_all_seq" ftype="fasta">
+            <assert_contents>
+                <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
+                <has_size value="627000" delta="20000"/>
+                <has_n_lines n="14806" delta="500"/>
+            </assert_contents>
+        </output>
+
+        <!-- Cluster table: two tab-separated columns; MYSTERY.13 is paired with itself. -->
+        <output name="output_cluster" ftype="tabular">
+            <assert_contents>
+                <has_line line="MYSTERY.13&#009;MYSTERY.13"/>
+                <has_n_columns n="2"/>
+                <has_size value="113000" delta="50000"/>
+            </assert_contents>
+        </output>
+    </test>
+</tests>
+<help><![CDATA[
+**MMseqs2: ultra fast and sensitive sequence search and clustering suite**
+
+MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
+MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows.
+The software is designed to run on multiple cores and servers and exhibits very good scalability.
+MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity.
+It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
+
+**Usage**
+
+MMseqs easy-linclust is useful to cluster entries from a FASTA/FASTQ file using the cascaded clustering algorithm.
+It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently.
+
+https://github.com/soedinglab/MMseqs2
+]]></help>
+<expand macro="citations"/>
+</tool>

Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering

comparison mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft