annotate cd_hit_est.xml @ 0:23f5701549b1

Uploaded
author jjohnson
date Mon, 12 Sep 2011 10:43:08 -0400
parents
children 34a799d173f7
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
1 <tool id="cd_hit_est" name="CD-HIT-EST" version="1.0">
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
2 <description>Cluster a nucleotide dataset into representative sequences</description>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
3 <command>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
4 cd-hit-est -i $fasta_in -o rep_seq -c $similarity -n $wordsize $strand
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
5 </command>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
6 <inputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
7 <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
8 <param name="similarity" type="float" value="0.9" label="similarity threshold: .75 - 1.0, default is .9">
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
9 <validator type="in_range" message="sequence similarity threshold should be .75 - 1.0" min=".75" max="1.0"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
10 </param>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
11 <param name="wordsize" type="integer" value="8" label="word size">
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
12 <help> Suggested word size:
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
13 8,9,10 for thresholds 0.90 ~ 1.0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
14 7 for thresholds 0.88 ~ 0.9
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
15 6 for thresholds 0.85 ~ 0.88
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
16 5 for thresholds 0.80 ~ 0.85
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
17 4 for thresholds 0.75 ~ 0.8
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
18 </help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
19 <validator type="in_range" message="word size should be between 4 and 10" min="4" max="10"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
20 </param>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
21 <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
22 </inputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
23 <outputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
24 <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
25 <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
26 </outputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
27 <requirements>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
28 <requirement type="binary">cd-hit-est</requirement>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
29 </requirements>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
30 <tests>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
31 </tests>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
32 <help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
33 **CD-HIT-EST**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
34
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
35 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition, cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit produces a set of closely related protein families from a given fasta sequence database.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
36
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
37 .. _CD-HIT: http://www.bioinformatics.org/cd-hit/
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
38
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
39 ------
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
40
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
41 **Inputs**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
42
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
43 cd-hit-est requires a fasta dataset as input.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
44
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
45 ------
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
46
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
47 **Outputs**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
48
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
49 A fasta dataset containing representative sequences.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
50
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
51 A text file listing the mapping of sequences to the representative sequences::
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
52
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
53 >Cluster 0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
54 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
55 >Cluster 1
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
56 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
57 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
58 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
59 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
60 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
61 >Cluster 2
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
62 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
63 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
64 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
65 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
66
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
67
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
68 </help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
69 </tool>