comparison hammock.xml @ 0:b1ac138f0287 draft

Uploaded
author hammock
date Mon, 26 Jan 2015 06:24:21 -0500
parents
children d90f4809ccc6
comparison
equal deleted inserted replaced
-1:000000000000 0:b1ac138f0287
1 <tool id="hammock_1.0" name="Hammock - cluster peptides">
2
3 <description>Clusters short peptide sequences</description>
4
5 <command interpreter="bash">
6 wrapper.sh \$HAMMOCK_JAR full --galaxy -t \${GALAXY_SLOTS:-4} -i $input --goc $output_clusters --gos $output_sequences
7 ## Optional -l: comma-terminated list built from the round_labels repeat. The accumulator is initialised before the loop appends to it.
8 #if $label_params.set_labels == "set":
9 #set $l_field = ""
10 #for $s in $label_params.round_labels:
11 #set $l_field = $l_field + str($s.label) + ","
12 #end for
13 -l $l_field
14 #end if
15 ## Greedy clustering options, emitted only when the user chose to set them manually.
16 #if $advanced_greedy_params.set_greedy_params == "set":
17 -x $advanced_greedy_params.max_shift
18 -p $advanced_greedy_params.shift_penalty
19 -m \${MATRIX_PATH}${advanced_greedy_params.scoring_matrix}.txt
20 #if $advanced_greedy_params.greedy_params.set_greedy == "set":
21 -g $advanced_greedy_params.greedy_params.greedy_threshold
22 #end if
23 #end if
24 ## HMM clustering options, emitted only when the user chose to set them manually.
25 #if $advanced_hmm_params.set_hmm_params == "set":
26 #if $advanced_hmm_params.threshold_params.clustering_threshold == "part":
27 -a $advanced_hmm_params.threshold_params.part_threshold
28 #end if
29
30 #if $advanced_hmm_params.threshold_params.clustering_threshold == "size":
31 -s $advanced_hmm_params.threshold_params.size_threshold
32 #end if
33
34 #if $advanced_hmm_params.threshold_params.clustering_threshold == "count":
35 -c $advanced_hmm_params.threshold_params.count_threshold
36 #end if
37 ## Per-round score thresholds are collected into three comma-terminated lists (one entry per Round repeat).
38 #set $n_field=""
39 #set $v_field=""
40 #set $r_field=""
41
42 #if $advanced_hmm_params.score_params.set_scores == "set":
43 #if $advanced_hmm_params.score_params.relative_scores == "relative":
44 -e
45 #end if
46 #if $advanced_hmm_params.score_params.relative_scores == "absolute":
47 -b
48 #end if
49
50 #for $s in $advanced_hmm_params.score_params.round:
51 #set $n_field = $n_field + str($s.assign_score) + ","
52 #set $v_field = $v_field + str($s.overlap_score) + ","
53 #set $r_field = $r_field + str($s.merge_score) + ","
54 #end for
55
56 -n $n_field
57 -v $v_field
58 -r $r_field
59 #end if
60
61 #if $advanced_hmm_params.match_state_params.set_max_aln_length == "set":
62 -j $advanced_hmm_params.match_state_params.max_aln_length
63 #end if
64 ## -q: the selector inside the extension_increase_length conditional is named set_max_aln_length and its option values are "true"/"false".
65 #if $advanced_hmm_params.extension_increase_length.set_max_aln_length == "true":
66 -q
67 #end if
68
69 -k $advanced_hmm_params.min_ic
70 -y $advanced_hmm_params.max_gap_proportion
71 -u $advanced_hmm_params.max_inner_gaps
72 -h $advanced_hmm_params.min_match_states
73 #end if
74
75 </command>
76
77 <inputs>
78 <param name="input" type="data" format="fasta" label="Source sequence file" help="File with sequences to cluster in fasta format. See -i, --input in manual for details."/>
79 <!-- Optional restriction to a user-entered list of sequence labels; the command passes the list with -l when "set" is chosen. -->
80 <conditional name="label_params">
81 <param type="select" name="set_labels" label="Specify a subset of labels to be used" help="Set Automatic to use all labels present in the data or choose a subset of labels to be used. See -l, --labels in manual for details.">
82 <option value="auto">Automatic - all labels</option>
83 <option value="set">Set list of labels manually</option>
84 </param>
85 <when value="auto"/>
86 <when value="set">
87 <repeat title="Label" name="round_labels">
88 <param type="text" name="label" label="Sequence label" value=""/>
89 </repeat>
90 </when>
91 </conditional>
92
93 <!-- Greedy clustering settings; in "set" mode the command passes them as -x, -p, -m and, when a threshold is given, -g. -->
94 <conditional name="advanced_greedy_params">
95 <param name="set_greedy_params" type="select" label="Greedy clustering options">
96 <option value="auto">Default - automatic settings</option>
97 <option value="set">Set manually</option>
98 </param>
99 <when value="auto" />
100 <when value="set">
101
102 <param name="max_shift" type="integer" value="3" min="0" label="Maximal sequence shift" help="Maximal number of positions sequences are allowed to shift by during greedy clustering. See -x, --max shift in manual for details." />
103
104 <param name="shift_penalty" type="integer" value="0" label="Sequence shift penalty" help="Score penalty added to each alignment score during greedy clustering. This penalty is added for every amino acid aligned towards a (trailing) gap. This value should typically be non-positive (With a positive value, sequences benefit from containing more gaps). See -p, --gap penalty in manual for details."/>
105 <!-- The selected option value is used by the command to build the matrix file name (value + ".txt"). -->
106 <param name="scoring_matrix" type="select" label="Substitution matrix schema." help="Select a substitution matrix to be used to score alignments during greedy clustering. See -m, --matrix in manual for details.">
107 <option value="blosum62">Blosum 62</option>
108 <option value="blosum30">Blosum 30</option>
109 <option value="blosum35">Blosum 35</option>
110 <option value="blosum40">Blosum 40</option>
111 <option value="blosum45">Blosum 45</option>
112 <option value="blosum50">Blosum 50</option>
113 <option value="blosum55">Blosum 55</option>
114 <option value="blosum60">Blosum 60</option>
115 <option value="blosum65">Blosum 65</option>
116 <option value="blosum70">Blosum 70</option>
117 <option value="blosum75">Blosum 75</option>
118 <option value="blosum80">Blosum 80</option>
119 <option value="blosum85">Blosum 85</option>
120 <option value="blosum90">Blosum 90</option>
121 <option value="blosum100">Blosum 100</option>
122 <option value="gonnet250">Gonnet 250</option>
123 <option value="pam250">Pam 250</option>
124 </param>
125
126 <conditional name="greedy_params">
127 <param name="set_greedy" type="select" label="Set greedy clustering threshold" help="Minimal alignment score needed for a sequence to join a cluster during greedy clustering. Can be either user defined or set automatically based on mean sequence length. See -g, --greedy threshold in manual for details.">
128 <option value="auto">Auto detection</option>
129 <option value="set">Set manually</option>
130 </param>
131 <when value="auto" />
132 <when value="set">
133 <param name="greedy_threshold" type="integer" value="24" min="0" label="Greedy clustering threshold" help="Minimal alignment score needed for a sequence to join a cluster during greedy clustering." />
134 </when>
135 </conditional>
136
137 </when>
138 </conditional>
139 <!-- HMM clustering settings; in "set" mode the command emits the cluster-core threshold, per-round scores and match-state limits defined below. -->
140 <conditional name="advanced_hmm_params">
141 <param name="set_hmm_params" type="select" label="HMM-clustering options">
142 <option value="auto">Default - automatic settings</option>
143 <option value="set">Set manually</option>
144 </param>
145 <when value="auto" />
146 <when value="set">
147
148 <!-- Exactly one of part / size / count is passed to the command (as -a, -s or -c); "auto" passes none. -->
149 <conditional name="threshold_params">
150 <param name="clustering_threshold" type="select" label="How many initial clusters to use as cluster cores" help="After greedy clustering, some of the largest clusters are selected as cluster cores for subsequent clustering procedure. The number of cluster cores can be determined either automatically or manually as top x percent of largest clusters, all clusters satisfying size threshold or exact number of clusters. See -a, --part threshold, -s, --size threshold and -c, --count threshold in manual for details.">
151 <option value="auto">Automatic setting</option>
152 <option value="part">Set percentual proportion</option>
153 <option value="size">Set size threshold</option>
154 <option value="count">Set explicit count</option>
155 </param>
156 <when value="auto" />
157 <when value="part">
158 <param name="part_threshold" type="float" value="0.025" min="0.00001" max="1.0" label="The proportion of the largest greedy clusters to be used as cluster cores in subsequent clustering procedure." help="See -a, --part threshold in manual for details." />
159 </when>
160 <when value="size">
161 <param name="size_threshold" type="integer" value="10" min="1" label="Minimum size of a greedy cluster needed for it to be used as cluster core in subsequent clustering procedure." help="See -s, --size threshold in manual for details."/>
162 </when>
163 <when value="count">
164 <param name="count_threshold" type="integer" value="25" min="1" label="The number of greedy clusters to be used as cluster cores in subsequent clustering procedure." help="See -c, --count threshold in manual for details"/>
165 </when>
166 </conditional>
167
168 <conditional name="score_params">
169 <param name="set_scores" type="select" label="Clustering rounds" help="Set the number of clustering rounds and score thresholds used. Automatic mode means 3 rounds and score thresholds defined based on mean sequence length.">
170 <option value="auto">Automatic settings</option>
171 <option value="set">Set manually</option>
172 </param>
173 <when value="auto" />
174 <when value="set">
175 <param name="relative_scores" type="select" label="Relative/absolute scores" help="All score thresholds in all clustering rounds can be interpreted either as relative values (per HMM match-state) or absolute values. See -e, --relative thresholds in manual for details.">
176 <option value="absolute">Scores are absolute values</option>
177 <option value="relative">Scores are relative, i.e. per match state</option>
178 </param>
179 <repeat name="round" title="Round">
180 <param name="assign_score" type="float" value="10.0" min="0.0" label="Assign threshold" help="Minimal score needed for a sequence to be assigned to a cluster. See -n, --assign thresholds in manual for details." />
181 <param name="overlap_score" type="float" value="8.0" min="0.0" label="Overlap threshold" help="Minimal score needed for two clusters to be considered overlapping. This affects cluster merging step heuristic speedup. If this is set to 0.0, full cluster merging routine will be performed, which is the most precise but the slowest. It is suggested to perform full cluster merging routine at least in the last round. See -v, --overlap thresholds in manual for details."/>
182 <param name="merge_score" type="float" value="10.0" min="0.0" label="Merge threshold" help="Minimal score needed for two clusters to be merged. See -r, --merge thresholds in manual for details"/>
183 </repeat>
184 </when>
185 </conditional>
186
187 <param name="min_match_states" type="integer" value="4" min="0" label="Minimal number of HMM match states." help=" Minimal number of match states maintained for each cluster's HMM throughout the computation. This parameter can be also viewed as minimal motif length. See -h, --min match states in manual for details."/>
188
189 <param name="max_gap_proportion" type="float" value="0.05" min="0.0" max="1.0" label="Maximal proportion of gaps allowed in a match state" help="Maximal proportion of gaps in HMM match states. Any multiple sequence alignment column containing more gaps will not be considered a match state. See -y, --max gap proportion in manual for details."/>
190
191 <param name="min_ic" type="float" value="1.2" min="0.0" max="4.3219280" label="Minimal information content allowed in a match state" help="Minimal information content (In terms of Shannon information theory) of HMM match states. Any multiple sequence alignment column having lower information content will not be considered a match state. Minimum: 0.0 (any MSA column composition), maximum: 4.32 (MSA column containing the same amino acid on each line). See -k, --min ic in manual for details."/>
192
193
194 <conditional name="match_state_params">
195 <param name="set_max_aln_length" type="select" label="Maximal alignment length" help="Maximal multiple sequence alignment length for every cluster. Can be either user defined or specified automatically based on mean sequence length. See -j, --max aln length in manual for details.">
196 <option value="auto">Auto detection</option>
197 <option value="set">Set manually</option>
198 </param>
199 <when value="auto" />
200 <when value="set">
201 <param name="max_aln_length" type="integer" value="24" min="0" label="Maximal alignment length" help="Maximal multiple sequence alignment length for every cluster." />
202 </when>
203 </conditional>
204
205 <param name="max_inner_gaps" type="integer" value="0" min="0" label="Maximum number of inner gaps" help="Maximum number of inner gaps in any line of any cluster's multiple sequence alignment. See -u, --max inner gaps in manual for details."/>
206 <!-- The selector below shares the name set_max_aln_length with the one in match_state_params; the name is left unchanged here. Its option values are "true" and "false". -->
207 <conditional name="extension_increase_length">
208 <param name="set_max_aln_length" type="select" label="Can MSA length be increased during extension step?" help="By default, only cluster merging can increase cluster's multiple sequence alignment length. Setting this option to 'Yes' will allow also sequence insertions to increase the MSA length. See -q, --extension increase length in manual for details.">
209 <option value="false">No</option>
210 <option value="true">Yes</option>
211 </param>
212 <when value="true" />
213 <when value="false" />
214 </conditional>
215 </when>
216 </conditional>
217
218 </inputs>
219 <!-- Two result datasets; the command hands their paths to the goc and gos options respectively. -->
220 <outputs>
221 <data name="output_clusters" format="csv"/>
222 <data name="output_sequences" format="csv"/>
223 </outputs>
224 <!-- Environment variables and versioned packages the tool declares; HAMMOCK_JAR and MATRIX_PATH are referenced by the command. -->
225 <requirements>
226 <requirement type="set_environment">HHLIB</requirement>
227 <requirement type="set_environment">HAMMOCK_JAR</requirement>
228 <requirement type="set_environment">MATRIX_PATH</requirement>
229 <requirement version="1.6.0" type="package">java</requirement>
230 <requirement version="1.2.0" type="package">clustalomega</requirement>
231 <requirement version="3.1b1" type="package">hmmer</requirement>
232 <requirement version="2.0.16" type="package">hhsuite</requirement>
233 </requirements>
234
235 <!-- Functional test: runs the tool on input.fa and compares both outputs with the stored reference files. -->
236 <tests>
237 <test>
238 <param name="input" value="input.fa" />
239 <param name="advanced_greedy_params.max_shift" value="3" />
240 <param name="advanced_greedy_params.shift_penalty" value="0" />
241 <param name="advanced_greedy_params.scoring_matrix" value="blosum62" />
242 <param name="advanced_hmm_params.min_match_states" value="4" />
243 <param name="advanced_hmm_params.min_ic" value="1.2" />
244 <param name="advanced_hmm_params.max_gap_proportion" value="0.05" />
245 <param name="advanced_hmm_params.max_inner_gaps" value="0" />
246 <!-- NOTE(review): the selectors set_greedy_params and set_hmm_params are not set in this test; confirm the nested values above are actually applied. -->
247 <output name="output_clusters" file="output_clusters.csv" />
248 <output name="output_sequences" file="output_sequences.csv" />
249 </test>
250 </tests>
251
252
253
254 <help>
255
256
257 **Hammock overview**
258
259 Hammock performs peptide sequence clustering. It is able to identify clusters of sequences sharing a sequence motif within big datasets. For news, documentation and other available versions, see http://www.recamo.cz/en/software/hammock-cluster-peptides/
260
261 ------
262
263 .. class:: infomark
264
265 **Citation**
266 Please cite:
267
268 Krejci A, et al. *in preparation*
269
270 ------
271
272 **Input format**
273
274 Hammock accepts fasta files. For basic work, fasta description lines (those starting with ">") may contain virtually anything. For work with the concept of sequence labels, description line should be in this form:
275
276 | >id|count|label
277
278 an example of two records in this format:
279
280 | >1|42|label1
281 | RSPIVRQLPSLP
282 | >2|58|label2
283 | GSWVVDISNVED
284
285 For more detailed description of the label concept and input format, see the documentation_.
286
287 ------
288
289 **Outputs**
290
291 Hammock returns two files, both are semicolon-separated tables.
292
293 The first is the cluster overview file. It contains one line for each resulting cluster plus header. Columns are:
294
295 cluster_id main_sequence sum label1 label2 label3 ...
296
297 | cluster_id: Cluster's unique numeric identifier.
298 | main_sequence: The most popular (appearing in the highest number of copies) sequence of this cluster
299 | sum: Total count of all sequences in this cluster (sum over all labels)
300 | label1, label2 etc. Counts of sequences with particular labels
301
302
303 The second file provides more detailed information. It contains one line for each clustered sequence plus header. Columns are:
304
305 cluster_id sequence alignment sum label1 label2 label3 ...
306
307 | cluster_id: Id of the cluster this sequence belongs to
308 | sequence: Amino acid sequence of this peptide
309 | alignment: Aligned amino acid sequence of this peptide (part of cluster's multiple sequence alignment)
310 | sum: Total count of copies of this sequence (sum over all labels)
311 | label1, label2 etc. Counts of copies with particular labels
312
313 ------
314
315 **Parameters**
316 Default and auto-detected parameters have been carefully tuned and tested to work well with several datasets, they are especially suited for short peptides from Phage display experiments. Nevertheless, there is no such thing as universal rules suitable for every dataset - parameter understanding and tuning may be needed. For more detailed description of parameters, see the documentation_.
317
318 .. _documentation: http://www.recamo.cz/userfiles/file/Software/Hammock/Hammock-manual.pdf
319
320
321 </help>
322
323 </tool>