Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering
diff mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7
author | iuc |
---|---|
date | Thu, 27 Mar 2025 14:37:56 +0000 |
parents | |
children |
line wrap: on
line diff
<!--
    Galaxy wrapper for "mmseqs easy-linclust".
    Shared pieces (requirements, version command, citations, the common
    pre-filter / align / expert parameters and the "common" section) are
    pulled in from macro.xml through the expand elements below.
-->
<tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>of very large datasets</description>
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
## Galaxy dataset paths carry no meaningful extension. Link the input to a
## name whose suffix reflects its datatype so that compressed input can be
## recognised by file name.
## NOTE(review): assumes MMseqs2 selects its decompressor from the ".gz" suffix - confirm against the MMseqs2 documentation.
#if $input_fasta.ext.endswith('.gz')
    #set $input_name = 'input.fasta.gz'
#else
    #set $input_name = 'input.fasta'
#end if
ln -s '$input_fasta' '$input_name' &&

mmseqs easy-linclust
    '$input_name'
    'result'
    'tmp'

    ## Alphabet specific options.
    ## The parameters live directly inside the <when> blocks of the
    ## "alph_type" conditional, so they are addressed as $alph_type.<name>.
    ## The selector is compared through str() instead of a quoted literal.
    #if str($alph_type.dbtype) == "1"
        --comp-bias-corr-scale $alph_type.comp_bias_corr_scale
        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
    #elif str($alph_type.dbtype) == "2"
        --zdrop $alph_type.zdrop
        --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
        --adjust-kmer-len $alph_type.adjust_kmer_len
    #end if

    ## Pre-filter options
    --add-self-matches $prefilter.add_self_matches
    -k $prefilter.kmer_length
    ##--split-memory-limit BYTE Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
    --mask $prefilter.mask
    --mask-prob $prefilter.mask_prob
    --mask-lower-case $prefilter.mask_lower_case
    --spaced-kmer-mode $prefilter.spaced_kmer_mode
    ##--spaced-kmer-pattern STR User-specified spaced k-mer pattern []
    ##--disk-space-limit BYTE Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]

    ## Align options
    -a $align.convertalis
    ## The next two parameters appear to overlap in meaning
    --alignment-mode $align.alignment_mode
    --alignment-output-mode $align.alignment_output_mode
    --wrapped-scoring $align.wrapped_scoring
    -e $align.evalue
    --min-seq-id $min_seq_id
    --min-aln-len $align.min_aln_len
    --seq-id-mode $align.seq_id_mode
    --alt-ali $align.alt_ali
    -c $cov
    --cov-mode $cov_mode
    ## Both limits are optional in the form: only pass the flag when a value
    ## was supplied, otherwise the flag would be emitted without an argument.
    #if str($align.max_rejected) != ''
        --max-rejected $align.max_rejected
    #end if
    #if str($align.max_accept) != ''
        --max-accept $align.max_accept
    #end if
    --score-bias $align.score_bias
    --realign $align.realign
    --realign-score-bias $align.realign_score_bias
    --realign-max-seqs $align.realign_max_seqs
    --corr-score-weight $align.corr_score_weight

    ## Clustering options
    --cluster-mode $cluster.cluster_mode
    --max-iterations $cluster.max_iterations
    --similarity-type $cluster.similarity_type

    ## kmermatcher options
    ##--weights STR Weights used for cluster prioritization []
    --cluster-weight-threshold $kmermatcher.cluster_weight_threshold
    --kmer-per-seq $kmermatcher.kmer_per_seq
    --hash-shift $kmermatcher.hash_shift
    --include-only-extendable $kmermatcher.include_only_extendable
    --ignore-multi-kmer $kmermatcher.ignore_multi_kmer

    ## Profile options
    ##--pca Pseudo count admixture strength []
    ##--pcb Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []

    ## Misc options
    --rescore-mode $misc.rescore_mode
    --dbtype $alph_type.dbtype
    --shuffle $misc.shuffle
    --id-offset $misc.id_offset

    ## Common options
    ##--compressed INT Write compressed output [0]
    --threads "\${GALAXY_SLOTS:-1}"
    ##-v INT Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
    --max-seq-len $common.max_seq_len
    ##--db-load-mode INT Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
    ##--mpi-runner STR Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
    ##--force-reuse BOOL Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
    ##--remove-tmp-files BOOL Delete temporary files [0]

    ## Expert options
    --filter-hits $expert.filter_hits
    --sort-results $expert.sort_results
    ##--create-lookup INT Create database lookup file (can be very large) [0]
    ]]></command>
    <inputs>
        <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" />
        <!-- Sequence alphabet selector; each branch exposes only the options that the command template passes for that alphabet. -->
        <conditional name="alph_type">
            <param argument="--dbtype" type="select" label="Input data type" help="" >
                <option value="0" selected="true">Automatic</option>
                <option value="1">Amino acid</option>
                <option value="2">Nucleotides</option>
            </param>
            <when value="0"/>
            <when value="1">
                <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
            </when>
            <when value="2">
                <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
                <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
                <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
            </when>
        </conditional>
        <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
        <param argument="--cov-mode" type="select" label="Coverage mode" help="" >
            <option value="0" selected="true">Coverage of query and target</option>
            <option value="1">Coverage of target</option>
            <option value="2">Coverage of query</option>
            <option value="3">Target seq. length has to be at least x% of query length</option>
            <option value="4">Query seq. length has to be at least x% of target length</option>
            <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
        </param>
        <param argument="-c" name="cov" type="float" min="0" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/>
        <section name="prefilter" title="Pre-filter">
            <expand macro="prefilter_common_parameters" />
            <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help="">
                <option value="0" selected="true">Use consecutive positions in k-mers</option>
                <option value="1">Use spaced k-mers</option>
            </param>
        </section>
        <section name="align" title="Align">
            <expand macro="align_common_parameters" />
            <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" >
                <option value="0" selected="true">Automatic</option>
                <option value="1">Only score and end_pos</option>
                <option value="2">Also start_pos and cov</option>
                <option value="3">Also seq.id</option>
                <option value="4">Only ungapped alignment</option>
            </param>
            <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
            <!-- Optional limits: when left empty the command template omits the corresponding flag. -->
            <param argument="--max-rejected" type="integer" min="0" value="2147483647" optional="true" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/>
            <param argument="--max-accept" type="integer" min="0" value="2147483647" optional="true" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/>
        </section>
        <section name="cluster" title="Clustering">
            <param argument="--cluster-mode" type="select" label="Cluster mode" help="" >
                <option value="0" selected="true">Set-Cover (greedy)</option>
                <option value="1">Connected component (BLASTclust)</option>
                <option value="2">Greedy clustering by sequence length (CDHIT)</option>
            </param>
            <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/>
            <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" >
                <option value="1">Alignment score</option>
                <option value="2" selected="true">Sequence identity</option>
            </param>
        </section>
        <section name="kmermatcher" title="K-mer matcher">
            <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster prioritization" help=""/>
            <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/>
            <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/>
            <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/>
            <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (&gt;=2)" help=""/>
        </section>
        <section name="misc" title="Misc">
            <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" >
                <option value="0" selected="true">Hamming distance</option>
                <option value="1">Local alignment (score only)</option>
                <option value="2">Local alignment</option>
                <option value="3">Global alignment</option>
                <option value="4">Longest alignment fulfilling window quality criterion</option>
            </param>
            <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/>
            <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
        </section>
        <expand macro="common_section"/>
        <section name="expert" title="Expert">
            <expand macro="expert_common_parameters" />
        </section>
        <!-- The option values below are the keys tested by the output filters. -->
        <section name="output_files" title="Selection of the output files">
            <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection">
                <option value="file_rep_seq" selected="true">Representative sequences in fasta</option>
                <option value="file_all_seq" selected="true">FASTA-like per cluster</option>
                <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option>
            </param>
        </section>
    </inputs>
    <!--
        Each output is collected from the "result" prefix given on the command
        line and is only created when its key was ticked in output_selection.
        The leading truthiness check keeps the filter from failing on an
        empty selection (presumably None when nothing is ticked).
    -->
    <outputs>
        <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" >
            <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter>
        </data>
        <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" >
            <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter>
        </data>
        <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list">
            <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Nucleotide input with all three outputs requested (the default selection). -->
        <test expect_num_outputs="3">
            <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
            <conditional name="alph_type">
                <param name="dbtype" value="2"/>
            </conditional>
            <output name="output_rep_seq" ftype="fasta">
                <assert_contents>
                    <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/>
                    <has_size value="551000" delta="50000"/>
                </assert_contents>
            </output>
            <output name="output_all_seq" ftype="fasta">
                <assert_contents>
                    <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
                    <has_size value="627000" delta="20000"/>
                    <has_n_lines n="14806" delta="500"/>
                </assert_contents>
            </output>
            <output name="output_cluster" ftype="tabular">
                <assert_contents>
                    <!-- The column separator is written as a character reference: a literal tab inside an attribute is normalised to a space by XML parsers. -->
                    <has_line line="MYSTERY.13&#9;MYSTERY.13"/>
                    <has_n_columns n="2"/>
                    <has_size value="113000" delta="50000"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
**MMseqs2: ultra fast and sensitive sequence search and clustering suite**

MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows.
The software is designed to run on multiple cores and servers and exhibits very good scalability.
MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity.
It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.

**Usage**

MMseqs easy-linclust clusters the entries of a FASTA/FASTQ file using the linear-time Linclust clustering algorithm.
It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently.

https://github.com/soedinglab/MMseqs2

    ]]></help>
    <expand macro="citations"/>
</tool>