Mercurial > repos > iuc > mmseqs2_easy_linclust_clustering
comparison mmseqs2_easy_linclust_clustering.xml @ 0:9f6869226de1 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 1400593429eb4e9c6e307df3621825a8b84a6fa7
author | iuc |
---|---|
date | Thu, 27 Mar 2025 14:37:56 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9f6869226de1 |
---|---|
1 <tool id="mmseqs2_easy_linclust_clustering" name="MMseqs2 Sequence Clustering" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | |
2 <description> | |
3 of very large datasets | |
4 </description> | |
5 <macros> | |
6 <import>macro.xml</import> | |
7 </macros> | |
8 <expand macro="biotools"/> | |
9 <expand macro="requirements"/> | |
10 <expand macro="version_command"/> | |
11 <command detect_errors="exit_code"><![CDATA[ | |
12 mmseqs easy-linclust | |
13 '$input_fasta' | |
14 'result' | |
15 'tmp' | |
16 ##Alphabet-specific options: these parameters are declared directly inside the alph_type conditional, so they are addressed as $alph_type.<name> | |
17 #if str($alph_type.dbtype) == "1" | |
18 --comp-bias-corr-scale $alph_type.comp_bias_corr_scale | |
19 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale | |
20 #elif str($alph_type.dbtype) == "2" | |
21 --zdrop $alph_type.zdrop | |
22 --kmer-per-seq-scale $alph_type.kmer_per_seq_scale | |
23 --adjust-kmer-len $alph_type.adjust_kmer_len | |
24 #end if | |
25 ##Pre-filter options | |
26 --add-self-matches $prefilter.add_self_matches | |
27 -k $prefilter.kmer_length | |
28 ##--split-memory-limit BYTE Set max memory per split. E.g. 800B, 5K, 10M, 1G. Default (0) to all available system memory [0] | |
29 --mask $prefilter.mask | |
30 --mask-prob $prefilter.mask_prob | |
31 --mask-lower-case $prefilter.mask_lower_case | |
32 --spaced-kmer-mode $prefilter.spaced_kmer_mode | |
33 ##--spaced-kmer-pattern STR User-specified spaced k-mer pattern [] | |
34 ##--disk-space-limit BYTE Set max disk space to use for reverse profile searches. E.g. 800B, 5K, 10M, 1G. Default (0) to all available disk space in the temp folder [0] | |
35 | |
36 ##Align options | |
37 -a $align.convertalis | |
38 ##The next 2 parameters seems to be the same | |
39 --alignment-mode $align.alignment_mode | |
40 --alignment-output-mode $align.alignment_output_mode | |
41 --wrapped-scoring $align.wrapped_scoring | |
42 -e $align.evalue | |
43 --min-seq-id $min_seq_id | |
44 --min-aln-len $align.min_aln_len | |
45 --seq-id-mode $align.seq_id_mode | |
46 --alt-ali $align.alt_ali | |
47 -c $cov | |
48 --cov-mode $cov_mode | |
49 --max-rejected $align.max_rejected | |
50 --max-accept $align.max_accept | |
51 --score-bias $align.score_bias | |
52 --realign $align.realign | |
53 --realign-score-bias $align.realign_score_bias | |
54 --realign-max-seqs $align.realign_max_seqs | |
55 --corr-score-weight $align.corr_score_weight | |
56 | |
57 ##Clustering options | |
58 --cluster-mode $cluster.cluster_mode | |
59 --max-iterations $cluster.max_iterations | |
60 --similarity-type $cluster.similarity_type | |
61 | |
62 ##kmermatcher options | |
63 ##--weights STR Weights used for cluster priorization [] | |
64 --cluster-weight-threshold $kmermatcher.cluster_weight_threshold | |
65 --kmer-per-seq $kmermatcher.kmer_per_seq | |
66 --hash-shift $kmermatcher.hash_shift | |
67 --include-only-extendable $kmermatcher.include_only_extendable | |
68 --ignore-multi-kmer $kmermatcher.ignore_multi_kmer | |
69 | |
70 ##Profile options | |
71 ##--pca Pseudo count admixture strength [] | |
72 ##--pcb Pseudo counts: Neff at half of maximum admixture (range 0.0-inf) [] | |
73 | |
74 ##Misc options | |
75 --rescore-mode $misc.rescore_mode | |
76 --dbtype $alph_type.dbtype | |
77 --shuffle $misc.shuffle | |
78 --id-offset $misc.id_offset | |
79 | |
80 ##Common options | |
81 ##--compressed INT Write compressed output [0] | |
82 --threads "\${GALAXY_SLOTS:-1}" | |
83 ##-v INT Verbosity level: 0: quiet, 1: +errors, 2: +warnings, 3: +info [3] | |
84 --max-seq-len $common.max_seq_len | |
85 ##--db-load-mode INT Database preload mode 0: auto, 1: fread, 2: mmap, 3: mmap+touch [0] | |
86 ##--mpi-runner STR Use MPI on compute cluster with this MPI command (e.g. "mpirun -np 42") [] | |
87 ##--force-reuse BOOL Reuse tmp files in tmp/latest folder ignoring parameters and version changes [0] | |
88 ##--remove-tmp-files BOOL Delete temporary files [0] | |
89 | |
90 ##Expert options | |
91 --filter-hits $expert.filter_hits | |
92 --sort-results $expert.sort_results | |
93 ##--create-lookup INT Create database lookup file (can be very large) [0] | |
94 ]]></command> | |
95 <inputs> | |
96 <param name="input_fasta" type="data" format="fasta,fasta.gz" label="Input fasta file" help="" /> | |
97 <conditional name="alph_type"> | |
98 <param argument="--dbtype" type="select" label="Input data type" help="" > | |
99 <option value="0" selected="true">Automatic</option> | |
100 <option value="1">Amino acid</option> | |
101 <option value="2">Nucleotides</option> | |
102 </param> | |
103 <when value="0"/> | |
104 <when value="1"> | |
105 <param argument="--comp-bias-corr-scale" type="float" min="0" max="1" value="1" label="Scale composition bias correction" help=""/> | |
106 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.000" label="Scale k-mer per sequence based on sequence length" help=""/> | |
107 </when> | |
108 <when value="2"> | |
109 <param argument="--zdrop" type="integer" min="0" value="40" label="Maximal allowed difference between score values before alignment is truncated" help=""/> | |
110 <param argument="--kmer-per-seq-scale" type="float" min="0" value="0.200" label="Scale k-mer per sequence based on sequence length" help=""/> | |
111 <param argument="--adjust-kmer-len" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Adjust k-mer length based on specificity" help=""/> | |
112 </when> | |
113 </conditional> | |
114 <param argument="--min-seq-id" type="float" min="0" max="1" value="0" label="Minimum sequence identity" help="List matches above this sequence identity for clustering"/> | |
115 <param argument="--cov-mode" type="select" label="Coverage mode" help="" > | |
116 <option value="0" selected="true">Coverage of query and target</option> | |
117 <option value="1">Coverage of target</option> | |
118 <option value="2">Coverage of query</option> | |
119 <option value="3">Target seq. length has to be at least x% of query length</option> | |
120 <option value="4">Query seq. length has to be at least x% of target length</option> | |
121 <option value="5">Short seq. needs to be at least x% of the other seq. length</option> | |
122 </param> | |
123 <param argument="-c" name="cov" type="float" min="0" max="1" value="0.800" label="List matches above this fraction of aligned (covered) residues" help="Fraction between 0 and 1; how coverage is measured depends on the selected coverage mode"/> | |
124 <section name="prefilter" title="Pre-filter"> | |
125 <expand macro="prefilter_common_parameters" /> | |
126 <param argument="--spaced-kmer-mode" type="select" label="Spaced k-mer mode" help=""> | |
127 <option value="0" selected="true">Use consecutive positions in k-mers</option> | |
128 <option value="1">Use spaced k-mers</option> | |
129 </param> | |
130 </section> | |
131 <section name="align" title="Align"> <!-- every parameter of this section is written unconditionally into the command line, so none of them may be left empty --> | |
132 <expand macro="align_common_parameters" /> | |
133 <param argument="--alignment-mode" type="select" label="Alignment mode : How to compute the alignment" help="" > | |
134 <option value="0" selected="true">Automatic</option> | |
135 <option value="1">Only score and end_pos</option> | |
136 <option value="2">Also start_pos and cov</option> | |
137 <option value="3">Also seq.id</option> | |
138 <option value="4">Only ungapped alignment</option> | |
139 </param> | |
140 <param argument="-e" name="evalue" type="float" min="0" value="1.000E-03" label="E-value threshold" help="List matches below this E-value"/> | |
141 <param argument="--max-rejected" type="integer" min="0" value="2147483647" label="Maximum rejected alignments before alignment calculation for a query is stopped" help=""/> | |
142 <param argument="--max-accept" type="integer" min="0" value="2147483647" label="Maximum accepted alignments before alignment calculation for a query is stopped" help=""/> | |
143 </section> | |
144 <section name="cluster" title="Clustering"> | |
145 <param argument="--cluster-mode" type="select" label="Cluster mode" help="" > | |
146 <option value="0" selected="true">Set-Cover (greedy)</option> | |
147 <option value="1">Connected component (BLASTclust)</option> | |
148 <option value="2">Greedy clustering by sequence length (CDHIT)</option> | |
149 </param> | |
150 <param argument="--max-iterations" type="integer" min="0" value="1000" label="Maximum depth of breadth first search in connected component clustering" help=""/> | |
151 <param argument="--similarity-type" type="select" label="Type of score used for clustering" help="" > | |
152 <option value="1">Alignment score</option> | |
153 <option value="2" selected="true">Sequence identity</option> | |
154 </param> | |
155 </section> | |
156 <section name="kmermatcher" title="K-mer matcher"> | |
157 <param argument="--cluster-weight-threshold" type="float" min="0" value="0.900" label="Weight threshold used for cluster priorization" help=""/> | |
158 <param argument="--kmer-per-seq" type="integer" min="0" value="21" label="Number of k-mers per sequence" help=""/> | |
159 <param argument="--hash-shift" type="integer" min="0" value="67" label="Shift k-mer hash initialization" help=""/> | |
160 <param argument="--include-only-extendable" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Include only extendable" help=""/> | |
161 <param argument="--ignore-multi-kmer" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Skip k-mers occurring multiple times (>=2)" help=""/> | |
162 </section> | |
163 <section name="misc" title="Misc"> | |
164 <param argument="--rescore-mode" type="select" label="Rescore diagonals with" help="" > | |
165 <option value="0" selected="true">Hamming distance</option> | |
166 <option value="1">Local alignment (score only)</option> | |
167 <option value="2">Local alignment</option> | |
168 <option value="3">Global alignment</option> | |
169 <option value="4">Longest alignment fulfilling window quality criterion</option> | |
170 </param> | |
171 <param argument="--shuffle" type="boolean" checked="true" truevalue="1" falsevalue="0" label="Shuffle input database" help=""/> | |
172 <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/> | |
173 </section> | |
174 <expand macro="common_section"/> | |
175 <section name="expert" title="Expert"> | |
176 <expand macro="expert_common_parameters" /> | |
177 </section> | |
178 <section name="output_files" title="Selection of the output files"> <!-- the option values below are the keys tested by the filter expressions of the outputs --> | |
179 <param name="output_selection" type="select" min="1" display="checkboxes" multiple="true" label="Output files selection"> | |
180 <option value="file_rep_seq" selected="true">Representative sequences in FASTA</option> | |
181 <option value="file_all_seq" selected="true">FASTA-like per cluster</option> | |
182 <option value="file_cluster_tsv" selected="true">Adjacency list in TSV</option> | |
183 </param> | |
184 </section> | |
185 </inputs> | |
186 <outputs> <!-- each dataset is collected from the working directory and only created when its entry is ticked in output_files/output_selection --> | |
187 <data name="output_rep_seq" format="fasta" from_work_dir="result_rep_seq.fasta" label="${tool.name} on ${on_string} : Representative sequences" > | |
188 <filter>output_files['output_selection'] and "file_rep_seq" in output_files['output_selection']</filter> | |
189 </data> | |
190 <data name="output_all_seq" format="fasta" from_work_dir="result_all_seqs.fasta" label="${tool.name} on ${on_string} : FASTA-like per cluster" > | |
191 <filter>output_files['output_selection'] and "file_all_seq" in output_files['output_selection']</filter> | |
192 </data> | |
193 <data name="output_cluster" format="tabular" from_work_dir="result_cluster.tsv" label="${tool.name} on ${on_string} : Adjacency list"> | |
194 <filter>output_files['output_selection'] and "file_cluster_tsv" in output_files['output_selection']</filter> | |
195 </data> | |
196 </outputs> | |
197 <tests> | |
198 <test expect_num_outputs="3"> | |
199 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/> | |
200 <conditional name="alph_type"> | |
201 <param name="dbtype" value="2"/> | |
202 </conditional> | |
203 <output name="output_rep_seq" ftype="fasta"> | |
204 <assert_contents> | |
205 <has_text text="TACTTCTCAGCTGTACTGTTTCTTGGTGTAGGGTCAACAACCCTTCAATGGATGTTCTCTTACTACCCAACCGATTGGGCGCACTACCGGGTCACATATGC"/> | |
206 <has_size value="551000" delta="50000"/> | |
207 </assert_contents> | |
208 </output> | |
209 <output name="output_all_seq" ftype="fasta"> | |
210 <assert_contents> | |
211 <has_text text="GAATAGCGGGACGCCAAGGGGCGGCCTTGCGTCCGCCCACGTGTGTGCTTGGCACGCGGGGCGTCCGCAAACCTTTGATCGGAACTTGCGATGGAGAAGCT"/> | |
212 <has_size value="627000" delta="20000"/> | |
213 <has_n_lines n="14806" delta="500"/> | |
214 </assert_contents> | |
215 </output> | |
216 <output name="output_cluster" ftype="tabular"> | |
217 <assert_contents> | |
218 <has_line line="MYSTERY.13	MYSTERY.13"/> | |
219 <has_n_columns n="2"/> | |
220 <has_size value="113000" delta="50000"/> | |
221 </assert_contents> | |
222 </output> | |
223 </test> | |
224 </tests> | |
225 <help><![CDATA[ | |
226 **MMseqs2: ultra fast and sensitive sequence search and clustering suite** | |
227 | |
228 MMseqs2 (Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. | |
229 MMseqs2 is open source GPL-licensed software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. | |
230 The software is designed to run on multiple cores and servers and exhibits very good scalability. | |
231 MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. | |
232 It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed. | |
233 | |
234 **Usage** | |
235 MMseqs easy-linclust clusters entries from a FASTA/FASTQ file using the cascaded clustering algorithm. | |
236 It offers an efficient clustering workflow, scaling linearly with input size. Similar to easy-cluster, but more suitable for handling very large datasets efficiently. | |
237 | |
238 https://github.com/soedinglab/MMseqs2 | |
239 | |
240 ]]></help> | |
241 <expand macro="citations"/> | |
242 </tool> |