comparison mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7
author iuc
date Thu, 27 Mar 2025 14:37:56 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9f6869226de1
1 <tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
2 <description>
3 of very large datasets
4 </description>
5 <macros>
6 <import>macro.xml</import>
7 </macros> <!-- NOTE(review): the @TOOL_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens and every macro expanded in this file are presumably defined in macro.xml; confirm there -->
8 <expand macro="biotools"/>
9 <expand macro="requirements"/>
10 <expand macro="version_command"/>
11 <command detect_errors="exit_code"><![CDATA[
12 mmseqs easy-linclust
13 '$input_fasta'
14 'result'
15 'tmp'
16 ##Positional arguments above: input FASTA, output prefix (outputs are picked up as result_* files), temporary directory. The type-specific options below read parameters that sit directly inside the alph_type conditional.
17 #if str($alph_type.dbtype) == "1"
18 --comp-bias-corr-scale $alph_type.comp_bias_corr_scale
19 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
20 #elif str($alph_type.dbtype) == "2"
21 --zdrop $alph_type.zdrop
22 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale
23 --adjust-kmer-len $alph_type.adjust_kmer_len
24 #end if
25 ##Pre-filter options
26 --add-self-matches $prefilter.add_self_matches
27 -k $prefilter.kmer_length
28 ##--split-memory-limit BYTE Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0]
29 --mask $prefilter.mask
30 --mask-prob $prefilter.mask_prob
31 --mask-lower-case $prefilter.mask_lower_case
32 --spaced-kmer-mode $prefilter.spaced_kmer_mode
33 ##--spaced-kmer-pattern STR User-specified spaced k-mer pattern []
34 ##--disk-space-limit BYTE Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0]
35
36 ##Align options
37 -a $align.convertalis
38 ##The next 2 parameters seem to be the same
39 --alignment-mode $align.alignment_mode
40 --alignment-output-mode $align.alignment_output_mode
41 --wrapped-scoring $align.wrapped_scoring
42 -e $align.evalue
43 --min-seq-id $min_seq_id
44 --min-aln-len $align.min_aln_len
45 --seq-id-mode $align.seq_id_mode
46 --alt-ali $align.alt_ali
47 -c $cov
48 --cov-mode $cov_mode
49 --max-rejected $align.max_rejected
50 --max-accept $align.max_accept
51 --score-bias $align.score_bias
52 --realign $align.realign
53 --realign-score-bias $align.realign_score_bias
54 --realign-max-seqs $align.realign_max_seqs
55 --corr-score-weight $align.corr_score_weight
56
57 ##Clustering options
58 --cluster-mode $cluster.cluster_mode
59 --max-iterations $cluster.max_iterations
60 --similarity-type $cluster.similarity_type
61
62 ##kmermatcher options
63 ##--weights STR Weights used for cluster prioritization []
64 --cluster-weight-threshold $kmermatcher.cluster_weight_threshold
65 --kmer-per-seq $kmermatcher.kmer_per_seq
66 --hash-shift $kmermatcher.hash_shift
67 --include-only-extendable $kmermatcher.include_only_extendable
68 --ignore-multi-kmer $kmermatcher.ignore_multi_kmer
69
70 ##Profile options
71 ##--pca Pseudo count admixture strength []
72 ##--pcb Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) []
73
74 ##Misc options
75 --rescore-mode $misc.rescore_mode
76 --dbtype $alph_type.dbtype
77 --shuffle $misc.shuffle
78 --id-offset $misc.id_offset
79
80 ##Common options
81 ##--compressed INT Write compressed output [0]
82 --threads "\${GALAXY_SLOTS:-1}"
83 ##-v INT Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3]
84 --max-seq-len $common.max_seq_len
85 ##--db-load-mode INT Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0]
86 ##--mpi-runner STR Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") []
87 ##--force-reuse BOOL Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0]
88 ##--remove-tmp-files BOOL Delete temporary files [0]
89
90 ##Expert options
91 --filter-hits $expert.filter_hits
92 --sort-results $expert.sort_results
93 ##--create-lookup INT Create database lookup file (can be very large) [0]
94 ]]></command>
95 <inputs> <!-- NOTE(review): parameters used by the command but not declared here (e.g. prefilter.kmer_length, align.convertalis, the common and expert options) are presumably supplied by the expanded macros; confirm against macro.xml -->
96 <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" />
97 <conditional name="alph_type"> <!-- type-specific parameters live directly in the when blocks, i.e. they are addressed as alph_type.NAME in the command -->
98 <param argument="--dbtype" type="select" label="Input data type" help="" >
99 <option value="0" selected="true">Automatic</option>
100 <option value="1">Amino acid</option>
101 <option value="2">Nucleotides</option>
102 </param>
103 <when value="0"/>
104 <when value="1">
105 <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/>
106 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/>
107 </when>
108 <when value="2">
109 <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/>
110 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/>
111 <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/>
112 </when>
113 </conditional>
114 <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/>
115 <param argument="--cov-mode" type="select" label="Coverage mode" help="" >
116 <option value="0" selected="true">Coverage of query and target</option>
117 <option value="1">Coverage of target</option>
118 <option value="2">Coverage of query</option>
119 <option value="3">Target seq. length has to be at least x% of query length</option>
120 <option value="4">Query seq. length has to be at least x% of target length</option>
121 <option value="5">Short seq. needs to be at least x% of the other seq. length</option>
122 </param>
123 <param argument="-c" name="cov" type="float" min="0" max="1" value="0.800" label="List matches above this fraction of aligned (covered) residues" help=""/> <!-- a fraction, so bounded to [0, 1] like the minimum sequence identity -->
124 <section name="prefilter" title="Pre-filter">
125 <expand macro="prefilter_common_parameters" />
126 <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help="">
127 <option value="0" selected="true">Use consecutive positions in k-mers</option>
128 <option value="1">Use spaced k-mers</option>
129 </param>
130 </section>
131 <section name="align" title="Align">
132 <expand macro="align_common_parameters" />
133 <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" >
134 <option value="0" selected="true">Automatic</option>
135 <option value="1">Only score and end_pos</option>
136 <option value="2">Also start_pos and cov</option>
137 <option value="3">Also seq.id</option>
138 <option value="4">Only ungapped alignment</option>
139 </param>
140 <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/>
141 <param argument="--max-rejected" type="integer" min="0" value="2147483647" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/> <!-- required: the command always emits this flag, so an empty value would break the command line -->
142 <param argument="--max-accept" type="integer" min="0" value="2147483647" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/> <!-- required for the same reason as max_rejected -->
143 </section>
144 <section name="cluster" title="Clustering">
145 <param argument="--cluster-mode" type="select" label="Cluster mode" help="" >
146 <option value="0" selected="true">Set-Cover (greedy)</option>
147 <option value="1">Connected component (BLASTclust)</option>
148 <option value="2">Greedy clustering by sequence length (CDHIT)</option>
149 </param>
150 <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/>
151 <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" >
152 <option value="1">Alignment score</option>
153 <option value="2" selected="true">Sequence identity</option>
154 </param>
155 </section>
156 <section name="kmermatcher" title="K-mer matcher">
157 <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster prioritization" help=""/>
158 <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/>
159 <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/>
160 <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/>
161 <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/>
162 </section>
163 <section name="misc" title="Misc">
164 <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" >
165 <option value="0" selected="true">Hamming distance</option>
166 <option value="1">Local alignment (score only)</option>
167 <option value="2">Local alignment</option>
168 <option value="3">Global alignment</option>
169 <option value="4">Longest alignment fulfilling window quality criterion</option>
170 </param>
171 <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/>
172 <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
173 </section>
174 <expand macro="common_section"/>
175 <section name="expert" title="Expert">
176 <expand macro="expert_common_parameters" />
177 </section>
178 <section name="output_files" title="Selection of the output files"> <!-- the option values below are the keys tested by the output filters -->
179 <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection">
180 <option value="file_rep_seq" selected="true">Representative sequences in fasta</option>
181 <option value="file_all_seq" selected="true">FASTA-like per cluster</option>
182 <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option>
183 </param>
184 </section>
185 </inputs>
186 <outputs> <!-- each dataset is collected from the matching result_* file in the working directory and is only created when its key is ticked in output_files/output_selection -->
187 <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" >
188 <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter>
189 </data>
190 <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" >
191 <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter>
192 </data>
193 <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list">
194 <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter>
195 </data>
196 </outputs>
197 <tests>
198 <test expect_num_outputs="3"> <!-- nucleotide run (dbtype=2); every other parameter keeps its default, so all three outputs are produced -->
199 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
200 <conditional name="alph_type">
201 <param name="dbtype" value="2"/>
202 </conditional>
203 <output name="output_rep_seq" ftype="fasta"> <!-- checked by one known sequence plus an approximate file size -->
204 <assert_contents>
205 <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/>
206 <has_size value="551000" delta="50000"/>
207 </assert_contents>
208 </output>
209 <output name="output_all_seq" ftype="fasta"> <!-- checked by one known sequence, approximate size and approximate line count -->
210 <assert_contents>
211 <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/>
212 <has_size value="627000" delta="20000"/>
213 <has_n_lines n="14806" delta="500"/>
214 </assert_contents>
215 </output>
216 <output name="output_cluster" ftype="tabular"> <!-- two tab-separated columns; the asserted line pairs MYSTERY.13 with itself -->
217 <assert_contents>
218 <has_line line="MYSTERY.13&#009;MYSTERY.13"/>
219 <has_n_columns n="2"/>
220 <has_size value="113000" delta="50000"/>
221 </assert_contents>
222 </output>
223 </test>
224 </tests>
225 <help><![CDATA[
226 **MMseqs2: ultra fast and sensitive sequence search and clustering suite**
227
228 MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets.
229 MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows.
230 The software is designed to run on multiple cores and servers and exhibits very good scalability.
231 MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity.
232 It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed.
233
234 **Usage**
235 MMseqs2 easy-linclust clusters the entries of a FASTA/FASTQ file using the linear-time Linclust algorithm.
236 It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently.
237
238 https://github.com/soedinglab/MMseqs2
239
240 ]]></help>
241 <expand macro="citations"/>
242 </tool>