annotate cd_hit_est.xml @ 5:28b7a43907f0 default tip

Fix cheetah tests of whether optional arguments are set
author Jim Johnson <jj@umn.edu>
date Thu, 29 Oct 2015 09:54:44 -0500
parents 43724ea1c85f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
cca0838c1597 Add an environment variable for the -M and -T options for memory and thread allocation
Jim Johnson <jj@umn.edu>
parents: 1
diff changeset
1 <tool id="cd_hit_est" name="CD-HIT-EST" version="1.2">
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
2 <description>Cluster a nucleotide dataset into representative sequences</description>
1
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
3 <requirements>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
4 <requirement type="package" version="4.6.1">cd-hit</requirement>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
5 </requirements>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
6 <macros>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
7 <import>cdhit_macros.xml</import>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
8 </macros>
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
9 <command>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
10 cd-hit-est -i "$fasta_in" -o rep_seq -c $similarity -n $wordsize $strand
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
11 #include source=$common_cdhit_options#
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
12 #include source=$runtime_tuning#
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
13 </command>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
14 <inputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
15 <param name="fasta_in" type="data" format="fasta" label="EST Sequences to cluster"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
16 <param name="similarity" type="float" value="0.9" label="similarity threshold: .75 - 1.0, default is .9">
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
17 <validator type="in_range" message="sequence similarity threshold should be .75 - 1.0" min=".75" max="1.0"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
18 </param>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
19 <param name="wordsize" type="integer" value="8" label="word size">
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
20 <help> Suggested word size:
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
21 8,9,10 for thresholds 0.90 ~ 1.0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
22 7 for thresholds 0.88 ~ 0.9
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
23 6 for thresholds 0.85 ~ 0.88
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
24 5 for thresholds 0.80 ~ 0.85
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
25 4 for thresholds 0.75 ~ 0.8
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
26 </help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
27 <validator type="in_range" message="word size should be between 4 and 10" min="4" max="10"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
28 </param>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
29 <param name="strand" type="boolean" truevalue="-r 1" falsevalue="" checked="false" label="Compare both strands"/>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
30 <expand macro="common_cdhit_options" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
31 <expand macro="runtime_tuning" />
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
32 </inputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
33 <outputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
34 <data format="txt" name="clusters_out" label="${tool.name} on ${on_string}: clusters" from_work_dir="rep_seq.clstr"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
35 <data format="fasta" name="fasta_out" label="${tool.name} on ${on_string}: representatives.fasta" from_work_dir="rep_seq"/>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
36 </outputs>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
37 <tests>
1
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
38 <test>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
39 <!-- Expect 3 clusters: 0,1,2 -->
1
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
40 <param name="fasta_in" value="cd_hit_est_in.fa" />
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
41 <param name="similarity" value="0.9"/>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
42 <param name="wordsize" value="8"/>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
43 <param name="strand" value="true"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
44 <!-- conditionals in macros -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
45 <param name="settings" value="no"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
46 <param name="tuning" value="default"/>
1
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
47 <output name="clusters_out">
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
48 <assert_contents>
3
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
49 <has_text text=">Cluster 0" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
50 <!-- There should not be a Cluster 3 -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
51 <not_has_text text="Cluster 3" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
52 <has_text_matching expression="F12Fcsw_481739" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
53 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
54 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
55 <output name="fasta_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
56 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
57 <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
58 </assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
59 </output>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
60 </test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
61 <test>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
62 <!-- tighter constraints should yield more clusters -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
63 <param name="fasta_in" value="cd_hit_est_in.fa" />
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
64 <param name="similarity" value="0.95"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
65 <param name="wordsize" value="9"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
66 <param name="strand" value="true"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
67 <!-- conditionals in macros -->
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
68 <param name="settings" value="no"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
69 <param name="tuning" value="default"/>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
70 <output name="clusters_out">
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
71 <assert_contents>
43724ea1c85f Add cd-hit for protein fastas
Jim Johnson <jj@umn.edu>
parents: 2
diff changeset
72 <has_text text=">Cluster 4" />
1
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
73 <has_text_matching expression=">F12Fcsw_481739" />
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
74 </assert_contents>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
75 </output>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
76 <output name="fasta_out">
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
77 <assert_contents>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
78 <has_text_matching expression="^>[MF]\d\dFcsw_\d*" />
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
79 </assert_contents>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
80 </output>
34a799d173f7 Add tool_dependencies and functional test
Jim Johnson <jj@umn.edu>
parents: 0
diff changeset
81 </test>
0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
82 </tests>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
83 <help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
84 **CD-HIT-EST**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
85
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
86 CD-HIT_ stands for Cluster Database at High Identity with Tolerance. The program (cd-hit) takes a fasta format sequence database as input and produces a set of 'non-redundant' (nr) representative sequences as output. In addition, cd-hit outputs a cluster file, documenting the sequence 'groupies' for each nr sequence representative. The idea is to reduce the overall size of the database without removing any sequence information by only removing 'redundant' (or highly similar) sequences. This is why the resulting database is called non-redundant (nr). Essentially, cd-hit-est produces a set of closely related nucleotide sequence families from a given fasta sequence database.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
87
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
88 .. _CD-HIT: http://www.bioinformatics.org/cd-hit/
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
89
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
90 ------
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
91
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
92 **Inputs**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
93
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
94 cd-hit-est requires a fasta dataset as input.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
95
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
96 ------
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
97
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
98 **Outputs**
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
99
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
100 A fasta dataset containing representative sequences.
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
101
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
102 A text file listing the mapping of sequences to the representative sequences::
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
103
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
104 >Cluster 0
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
105 0 2799aa, >PF04998.6|RPOC2_CHLRE/275-3073... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
106 >Cluster 1
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
107 0 2214aa, >PF06317.1|Q6Y625_9VIRU/1-2214... at 80%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
108 1 2215aa, >PF06317.1|O09705_9VIRU/1-2215... at 84%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
109 2 2217aa, >PF06317.1|Q6Y630_9VIRU/1-2217... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
110 3 2216aa, >PF06317.1|Q6GWS6_9VIRU/1-2216... at 84%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
111 4 527aa, >PF06317.1|Q67E14_9VIRU/6-532... at 63%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
112 >Cluster 2
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
113 0 2202aa, >PF06317.1|Q6UY61_9VIRU/8-2209... at 60%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
114 1 2208aa, >PF06317.1|Q6IVU4_JUNIN/1-2208... *
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
115 2 2207aa, >PF06317.1|Q6IVU0_MACHU/1-2207... at 73%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
116 3 2208aa, >PF06317.1|RRPO_TACV/1-2208... at 69%
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
117
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
118
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
119 </help>
23f5701549b1 Uploaded
jjohnson
parents:
diff changeset
120 </tool>