comparison cd_hit.xml @ 0:e0da3400ac2f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cdhit commit 8e14fc2573a53eaf8a538e018ae292f4c3134ec2
author iuc
date Mon, 15 Oct 2018 10:54:50 -0400
parents
children 7807800a3d03
comparison
equal deleted inserted replaced
-1:000000000000 0:e0da3400ac2f
1 <tool id="cd_hit" name="cd-hit" version="4.6.8.1">
2 <description>Cluster or compare biological sequence datasets</description>
3 <requirements> <!-- external software this wrapper depends on -->
4 <requirement type="package" version="4.6.8">cd-hit</requirement>
5 </requirements>
6 <version_command><![CDATA[
7 cd-hit | grep "CD-HIT version" | cut -d" " -f 4
8 ]]></version_command> <!-- keeps the 4th space-separated field of the "CD-HIT version" line printed by a bare cd-hit call -->
9 <command detect_errors="exit_code"><![CDATA[
10 cd-hit$est.est_select$twod.twod_select
11 -i '$fasta_in'
12 -o rep_seq
13 -c $est.similarity
14 -n $est.wordsize
15 #if $est.est_select == '-est':
16 $est.strand
17 #if str($est.estadvalign.mask) not in ('', 'None'):
18 -mask '$est.estadvalign.mask' ## only passed when masking letters were actually entered
19 #end if
20 -match $est.estadvalign.match
21 -mismatch $est.estadvalign.mismatch
22 -gap $est.estadvalign.gap
23 -gap-ext $est.estadvalign.gapext
24 #else:
25 -t $est.redtol
26 #end if
27 #if $twod.twod_select == '-2d':
28 -i2 '$twod.fasta_in2'
29 #if $advanced.advalign.style == 'local':
30 -s2 $advanced.advalign.advancedtwod.cutoff_diff_len2
31 -S2 $advanced.advalign.advancedtwod.aa_cutoff_diff_len2
32 #end if
33 #end if
34 ## options from the "Advanced options" section
35 -b $advanced.band_width
36 -l $advanced.throw_away_len
37 #if $advanced.advalign.style == 'local':
38 -G 0
39 -aL $advanced.advalign.align_coverage_long
40 -AL $advanced.advalign.align_coverage_long_control
41 -aS $advanced.advalign.align_coverage_short
42 -AS $advanced.advalign.align_coverage_short_control
43 -A $advanced.advalign.align_coverage_min
44 -s $advanced.advalign.cutoff_diff_len
45 -S $advanced.advalign.aa_cutoff_diff_len
46 #end if
47 -uL $advanced.max_unmatched_per_l
48 -uS $advanced.max_unmatched_per_s
49 -U $advanced.max_unmatched_len
50 $advanced.accurate
51 $advanced.inram
52 #if $print_alnovl.print_alnovl_select == "yes":
53 -p 1
54 -d $print_alnovl.desclen
55 #end if
56 ## memory (-M) and threads (-T) are taken from the Galaxy job environment
57 ## -M falls back to 0 (unlimited) instead of the cd-hit default of 800
58 -M \${GALAXY_MEMORY_MB:-0}
59 -T \${GALAXY_SLOTS:-1}
60 ]]></command>
61 <inputs>
62 <param name="fasta_in" argument="-i" type="data" format="fasta" label="Sequences to cluster/compare"/>
63 <conditional name="twod"> <!-- the selected value is appended verbatim to the executable name in the command template -->
64 <param name="twod_select" type="select" label="Cluster / Compare (i.e. call cd-hit[-est] / cd-hit[-est]-2d)?">
65 <option value="" selected="true">Cluster sequences</option>
66 <option value="-2d">Compare with 2nd sequence data set</option>
67 </param>
68 <when value=""/> <!-- clustering a single dataset needs no further inputs -->
69 <when value="-2d">
70 <param name="fasta_in2" argument="-i2" type="data" format="fasta" label="Other sequences to cluster/compare"/>
71 </when>
72 </conditional>
73 <conditional name="est"> <!-- the selected value is appended to the executable name, so "-est" runs cd-hit-est -->
74 <param name="est_select" type="select" label="Sequence type?" help="For nucleotides the -est variant of cd-hit is called">
75 <option value="" selected="true">Protein</option>
76 <option value="-est">Nucleotides</option>
77 </param>
78 <when value=""> <!-- protein mode: identity, word size and redundancy tolerance (-t) -->
79 <param name="similarity" argument="-c" type="float" min="0.4" max="1.0" value="0.9" label="Sequence identity threshold" help="Global sequence identity: number of identical alignment positions divided by the full length of the shorter sequence"/>
80 <param name="wordsize" argument="-n" type="integer" min="2" max="5" value="5" label="Word size">
81 <help>Suggested word size:
82 5 for thresholds 0.7 ~ 1.0
83 4 for thresholds 0.6 ~ 0.7
84 3 for thresholds 0.5 ~ 0.6
85 2 for thresholds 0.4 ~ 0.5 (-n)
86 </help>
87 </param>
88 <param name="redtol" argument="-t" type="integer" value="2" label="Tolerance for redundance"/>
89 </when>
90 <when value="-est"> <!-- nucleotide mode: identity, word size, strand handling and alignment scoring -->
91 <param name="similarity" argument="-c" type="float" min="0.8" max="1.0" value="0.9" label="Sequence identity threshold" help="Global sequence identity: number of identical alignment positions divided by the full length of the shorter sequence"/>
92 <param name="wordsize" argument="-n" type="integer" min="4" max="11" value="10" label="Word size">
93 <help>Suggested word size:
94 10,11 for threshold in 0.95 ~ 1.0
95 8,9 for threshold in 0.9 ~ 0.95
96 7 for threshold in 0.88 ~ 0.9
97 6 for threshold in 0.85 ~ 0.88
98 5 for threshold in 0.80 ~ 0.85
99 4 for threshold in 0.75 ~ 0.8 (-n)
100 </help>
101 </param>
102 <param name="strand" argument="-r" type="boolean" truevalue="-r 1" falsevalue="-r 0" checked="false" label="Compare both strands?"/>
103 <section name="estadvalign" title="Advanced EST alignment options">
104 <param argument="-mask" type="text" optional="true" label="Masking letters" help="NX, to mask out both 'N' and 'X'"/>
105 <param argument="-match" type="integer" value="2" label="Match score" help="Default 2 (1 for T-U and N-N)"/>
106 <param argument="-mismatch" type="integer" value="-2" label="Mismatch score"/>
107 <param argument="-gap" type="integer" value="-6" label="Gap opening score"/>
108 <param name="gapext" argument="-gap-ext" type="integer" value="-1" label="Gap extension score"/>
109 </section>
110 </when>
111 </conditional>
112 <section name="advanced" title="Advanced options"> <!-- alignment tuning options, read as $advanced.* by the command template -->
113 <param name="band_width" argument="-b" type="integer" min="1" value="20" label="Alignment band width"/>
114 <param name="throw_away_len" argument="-l" type="integer" min="1" value="10" label="Length of throw away sequences"/>
115 <conditional name="advalign"> <!-- "local" makes the command template emit -G 0 together with the coverage and length cutoffs below -->
116 <param name="style" type="select" label="Use local/global sequence identity" help="Local sequence identity: number of identical amino alignment positions divided by the length of the alignment. Note: in local mode one of -aL, aS, or A must be > 0!">
117 <option value="global" selected="true">Global</option>
118 <option value="local" >Local</option>
119 </param>
120 <when value="global"/>
121 <when value="local">
122 <param name="align_coverage_long" argument="-aL" type="float" min="0.0" max="1.0" value="0.0" label="Alignment coverage for the longer sequence" help="Fraction of the longer sequence that must be covered"/>
123 <param name="align_coverage_long_control" argument="-AL" type="integer" min="0" value="99999999" label="Alignment coverage control for the longer sequence" help="Maximum number of uncovered residues of the longer sequence"/>
124 <param name="align_coverage_short" argument="-aS" type="float" min="0.0" max="1.0" value="0.0" label="Alignment coverage for the shorter sequence" help="Fraction of the shorter sequence that must be covered"/>
125 <param name="align_coverage_short_control" argument="-AS" type="integer" min="0" value="99999999" label="Alignment coverage control for the shorter sequence" help="Maximum number of uncovered residues of the shorter sequence"/>
126 <param name="align_coverage_min" argument="-A" type="integer" min="0" value="0" label="Minimal alignment coverage control for both sequences" help="Minimum number of residues of both sequences that must be covered"/>
127 <param name="cutoff_diff_len" argument="-s" type="float" min="0.0" max="1.0" value="0.0" label="Length difference cutoff" help="If set to 0.9, the shorter sequences need to be at least 90% length of the representative of the cluster"/>
128 <param name="aa_cutoff_diff_len" argument="-S" type="integer" min="0" value="999999" label="Length difference cutoff in residues" help="If set to 60, the length difference between the shorter sequences and the representative of the cluster can not be bigger than 60"/>
129 <section name="advancedtwod" title="Advanced options for 2D local alignment"> <!-- only used by the command template when the -2d variant is selected -->
130 <param name="cutoff_diff_len2" argument="-s2" type="float" min="0.0" max="1.0" value="1.0" label="Length difference cutoff for other sequences" help="By default, seqs in db1 >= seqs in db2 in a same cluster if set to 0.9, seqs in db1 may just >= 90% seqs in db2. Only effective in local alignment mode"/>
131 <param name="aa_cutoff_diff_len2" argument="-S2" type="integer" min="0" value="0" label="Length difference cutoff in residues for other sequences" help="by default, seqs in db1 >= seqs in db2 in a same cluster if set to 60, seqs in db2 may 60aa longer than seqs in db1. Only effective in local alignment mode"/>
132 </section>
133 </when>
134 </conditional>
135 <param name="max_unmatched_per_l" argument="-uL" type="float" min="0.0" max="1.0" value="1.0" label="Maximum unmatched percentage for the longer sequence" help="If set to 0.1, the unmatched region (excluding leading and tailing gaps) must not be more than 10% of the sequence"/>
136 <param name="max_unmatched_per_s" argument="-uS" type="float" min="0.0" max="1.0" value="1.0" label="Maximum unmatched percentage for the shorter sequence" help="If set to 0.1, the unmatched region (excluding leading and tailing gaps) must not be more than 10% of the sequence"/>
137 <param name="max_unmatched_len" argument="-U" type="integer" min="0" value="99999999" label="Maximum unmatched length" help="If set to 10, the unmatched region (excluding leading and tailing gaps) must not be more than 10 bases"/>
138 <param name="inram" argument="-B" type="boolean" truevalue="-B 0" falsevalue="-B 1" checked="true" label="Sequences are stored in RAM" help="If false: sequences are stored on hard drive - use for huge data sets"/>
139 <param name="accurate" argument="-g" type="boolean" truevalue="-g 1" falsevalue="-g 0" checked="false" label="Accurate but slow mode" help="By cd-hit's default algorithm, a sequence is clustered to the first cluster that meet the threshold (fast cluster). If set to true, the program will cluster it into the most similar cluster that meet the threshold (accurate but slow mode)"/>
140 </section>
141
142 <conditional name="print_alnovl"> <!-- choosing "yes" makes the command template add -p 1 and -d with the value below -->
143 <param name="print_alnovl_select" type="select" label="Print alignment overlap in .clstr file?">
144 <option value="no" selected="true">No</option>
145 <option value="yes">Yes</option>
146 </param>
147 <when value="no"/>
148 <when value="yes">
149 <param name="desclen" argument="-d" type="integer" min="0" value="20" label="Length of description in .clstr file" help="If set to 0, it takes the fasta defline and stops at first space"/>
150 </when>
151 </conditional>
152 </inputs>
153
154 <outputs> <!-- cd-hit is run with "-o rep_seq"; both result files are picked up from the job working directory -->
155 <data name="clusters_out" format="txt" label="${tool.name} on ${on_string}: Clusters" from_work_dir="rep_seq.clstr"/>
156 <data name="fasta_out" format="fasta" label="${tool.name} on ${on_string}: Representative sequences" from_work_dir="rep_seq"/>
157 </outputs>
158
159 <tests>
160 <!-- cd-hit: protein clustering with default options (identity 0.9, word size 5) -->
161 <test>
162 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
163 <conditional name="twod">
164 <param name="twod_select" value="" />
165 </conditional>
166 <conditional name="est">
167 <param name="est_select" value="" />
168 <param name="wordsize" value="5" />
169 </conditional>
170 <param name="similarity" value="0.9" />
171 <output name="clusters_out" file="protein_clusters_output.txt"/>
172 <output name="fasta_out" file="protein_fasta_output.fasta"/>
173 </test>
174 <!-- cd-hit with local identity (short-sequence coverage 0.9), identity 0.8, and alignment overlap printed with description length 40 -->
175 <test>
176 <param name="fasta_in" value="cd_hit_protein_in.fasta" />
177 <conditional name="twod">
178 <param name="twod_select" value="" />
179 </conditional>
180 <conditional name="est">
181 <param name="est_select" value="" />
182 <param name="wordsize" value="5" />
183 </conditional>
184 <param name="similarity" value="0.8" />
185 <section name="advanced">
186 <conditional name="advalign">
187 <param name="style" value="local" />
188 <param name="align_coverage_short" value="0.9" />
189 </conditional>
190 </section>
191 <conditional name="print_alnovl">
192 <param name="print_alnovl_select" value="yes"/>
193 <param name="desclen" value="40"/>
194 </conditional>
195 <output name="clusters_out" file="protein_clusters_output_local.txt"/>
196 <output name="fasta_out" file="protein_fasta_output_local.fasta"/>
197 </test>
198 <!-- cd-hit-est: nucleotide clustering, word size 8, strand option off (-r 0) -->
199 <test>
200 <param name="fasta_in" value="cd_hit_est_in.fa" />
201 <conditional name="twod">
202 <param name="twod_select" value="" />
203 </conditional>
204 <conditional name="est">
205 <param name="est_select" value="-est" />
206 <param name="wordsize" value="8"/>
207 <param name="strand" value="false"/>
208 </conditional>
209 <param name="similarity" value="0.9"/>
210 <output name="clusters_out" file="est_clusters_output.txt"/>
211 <output name="fasta_out" file="est_fasta_output.fasta"/>
212 </test>
213 <!-- cd-hit-est-2d: compare db1.fasta with db2.fasta, word size 8, strand option on (-r 1) -->
214 <test>
215 <param name="fasta_in" value="db1.fasta" />
216 <conditional name="twod">
217 <param name="twod_select" value="-2d" />
218 <param name="fasta_in2" value="db2.fasta" />
219 </conditional>
220 <conditional name="est">
221 <param name="est_select" value="-est" />
222 <param name="wordsize" value="8"/>
223 <param name="strand" value="true"/>
224 </conditional>
225 <param name="similarity" value="0.9"/>
226 <output name="clusters_out" file="est-2d.txt.clstr"/>
227 <output name="fasta_out" file="est-2d.txt"/>
228 </test>
229 </tests>
230 <help><![CDATA[
231 **What it does**
232
233 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The tool implements four variants: cd-hit, cd-hit-est, cd-hit-2d, and cd-hit-est-2d.
234
235 The program cd-hit (resp. cd-hit-est) takes a FASTA format aminoacid (resp. nucleotide) sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition cd-hit outputs a cluster file, documenting the members of the sequence clusters for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit (resp. cd-hit-est) produces a set of closely related protein (resp. nucleotide sequence) families from a given FASTA sequence database.
236
237 The program cd-hit-2d (resp. cd-hit-est-2d) compares two aminoacid (resp. nucleotide) sequence datasets (db1, db2) in FASTA format. It identifies the sequences in db2 that are similar to db1 at a certain threshold. It outputs two files: a FASTA file of sequences in db2 that are not similar to db1 and a text file that lists similar sequences between db1 & db2.
238
239 .. _CD-HIT: http://weizhongli-lab.org/cd-hit/
240
241 ------
242
243 **Inputs**
244
245 cd-hit/cd-hit-2d requires a (two) protein FASTA file(s) as input.
246
247 cd-hit-est/cd-hit-est-2d requires a (two) nucleotide FASTA file(s) as input.
248
249 ------
250
251 **Outputs**
252
253 For cd-hit and cd-hit-est:
254
255 1. The first output is a FASTA file containing representative sequences.
256
257 2. The second output is a text file listing the mapping of sequences to the representative sequences:
258
259 >Cluster 0
260 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
261 >Cluster 1
262 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
263 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
264 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
265 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
266 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
267 >Cluster 2
268 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
269 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
270 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
271 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
272
273 For cd-hit-2d and cd-hit-est-2d:
274
275 1. The first output is a FASTA file of sequences in db2 that are not similar to db1.
276
277 2. The second output is a text file that lists similar sequences between db1 & db2.
278 ]]></help>
279 <citations> <!-- publications to cite for this tool, given as DOIs -->
280 <citation type="doi">10.1093/bioinformatics/btl158</citation>
281 <citation type="doi">10.1093/bioinformatics/bts565</citation>
282 </citations>
283 </tool>