comparison macros.xml @ 0:62479bdcc059 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hmmer3 commit 4164b44c651bcbdac6637eccce61b2a802c9b569
author iuc
date Tue, 12 May 2015 15:04:26 -0400
parents
children b5066aa77fea
comparison
equal deleted inserted replaced
-1:000000000000 0:62479bdcc059
1 <?xml version="1.0"?>
2 <macros>
3 <xml name="requirements"> <!-- Shared dependency block: declares the hmmer3_1b1 package at version 3.1b1 -->
4 <requirements>
5 <requirement type="package" version="3.1b1">hmmer3_1b1</requirement>
6 <yield/> <!-- NOTE(review): presumably lets an including tool add its own requirement entries here; confirm against the Galaxy macro documentation -->
7 </requirements>
8 </xml>
9 <!-- Wrapper version string made available as the WRAPPER_VERSION token --> <token name="@WRAPPER_VERSION@">0.1</token>
10 <xml name="stdio">
11 <stdio>
12 <!-- Any exit code other than zero (positive or negative) is treated as an error -->
13 <exit_code range="1:"/>
14 <exit_code range=":-1"/>
15 <!-- In case the return code has not been set properly, also scan the tool output for error markers. NOTE(review): the original remark mentions stderr, but no source attribute is set on these patterns; confirm which streams are scanned -->
16 <regex match="Error:"/>
17 <regex match="Exception:"/>
18 </stdio>
19 </xml>
20 <!-- Threshold flags for the command line. E and domE are always passed; every other threshold is passed only when it has been set. The bit-score thresholds (T, domT, incT, incdomT) are tested through str() so that an explicit 0 is not mistaken for an unset field; the E-value thresholds keep the plain truth test. --> <token name="@THRESHOLDS@">
21 -E $E
22 --domE $domE
23
24 #if str($T) != '':
25 -T $T
26 #end if
27
28 #if str($domT) != '':
29 --domT $domT
30 #end if
31
32 #if $incE:
33 --incE $incE
34 #end if
35
36 #if str($incT) != '':
37 --incT $incT
38 #end if
39
40 #if $incdomE:
41 --incdomE $incdomE
42 #end if
43
44 #if str($incdomT) != '':
45 --incdomT $incdomT
46 #end if
47 </token>
48 <xml name="thresholds_xml">
49 <!-- Reporting thresholds: which sequences and domains are written to the output -->
50 <param name="E" type="float" label="report sequences &lt;= this E-Value threshold in output" help="(-E)" value="10.0" min="0"/>
51 <param name="domE" type="float" label="report domains &lt;= this E-Value threshold in output" help="(--domE)" value="10.0" min="0"/>
52 <param name="T" type="float" label="report sequences &gt;= this score threshold in output" help="(-T)" optional="True"/>
53 <param name="domT" type="float" label="report domains &gt;= this score threshold in output" help="(--domT)" optional="True"/>
54 <!-- Inclusion thresholds: which sequences and domains count as significant -->
55 <param name="incE" type="float" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)" optional="True"/>
56 <param name="incdomE" type="float" label="consider domains &lt;= this E-Value threshold as significant" help="(--incdomE)" optional="True"/>
57 <param name="incT" type="float" label="consider sequences &gt;= this score threshold as significant" help="(--incT)" optional="True"/>
58 <param name="incdomT" type="float" label="consider domains &gt;= this score threshold as significant" help="(--incdomT)" optional="True"/>
59 </xml>
60 <!-- Per-sequence threshold flags (no per-domain variants). E is always passed; T, incE and incT are passed only when set. The bit-score thresholds (T, incT) are tested through str() so that an explicit 0 is not mistaken for an unset field; incE keeps the plain truth test. --> <token name="@THRESHOLDS_NODOM@">
61 -E $E
62
63 #if str($T) != '':
64 -T $T
65 #end if
66
67 #if $incE:
68 --incE $incE
69 #end if
70
71 #if str($incT) != '':
72 --incT $incT
73 #end if
74 </token>
75 <xml name="thresholds_nodom">
76 <!-- Reporting thresholds: which sequences are written to the output -->
77 <param name="E" type="float" label="report sequences &lt;= this E-Value threshold in output" help="(-E)" value="10.0" min="0"/>
78 <param name="T" type="float" label="report sequences &gt;= this score threshold in output" help="(-T)" optional="True"/>
79 <!-- Inclusion thresholds: which sequences count as significant -->
80 <param name="incE" type="float" label="consider sequences &lt;= this E-Value threshold as significant" help="(--incE)" optional="True"/>
81 <param name="incT" type="float" label="consider sequences &gt;= this score threshold as significant" help="(--incT)" optional="True"/>
82 </xml>
83 <!-- Acceleration-heuristic flags: the max and nobias switches as rendered by their boolean inputs, with the F1, F2 and F3 filter thresholds always passed in between --> <token name="@ACCEL_HEUR@">
84 $max
85 --F1 $F1
86 --F2 $F2
87 --F3 $F3
88 $nobias
89
90 </token>
91 <xml name="accel_heur_xml">
92 <!-- Acceleration heuristics: filter switches and the per-stage P-value thresholds -->
93 <param name="max" type="boolean" label="Turn all heuristic filters off (less speed, more power)" help="(--max)" truevalue="--max" falsevalue=""/>
94 <param name="F1" type="float" label="Stage 1 (MSV) threshold: promote hits w/ P &lt;= F1" help="(--F1)" value="0.02"/>
95 <param name="F2" type="float" label="Stage 2 (Vit) threshold: promote hits w/ P &lt;= F2" help="(--F2)" value="1e-3"/>
96 <param name="F3" type="float" label="Stage 3 (Fwd) threshold: promote hits w/ P &lt;= F3" help="(--F3)" value="1e-5"/>
97 <param name="nobias" type="boolean" label="Turn off composition bias filter" help="(--nobias)" truevalue="--nobias" falsevalue=""/>
98 </xml>
99 <!-- E-value calibration flags; all seven values (EmL, EmN, EvL, EvN, EfL, EfN, Eft) are always passed --> <token name="@EVAL_CALIB@">
100 --EmL $EmL
101 --EmN $EmN
102 --EvL $EvL
103 --EvN $EvN
104 --EfL $EfL
105 --EfN $EfN
106 --Eft $Eft
107 </token>
108 <xml name="eval_calib_xml">
109 <!-- E-value calibration inputs: simulation sizes for the MSV, Viterbi and Forward fits -->
110 <param name="EmL" type="integer" label="Length of sequences for MSV Gumbel mu fit" help="(--EmL)" value="200" min="1"/>
111 <param name="EmN" type="integer" label="Number of sequences for MSV Gumbel mu fit" help="(--EmN)" value="200" min="1"/>
112 <param name="EvL" type="integer" label="Length of sequences for Viterbi Gumbel mu fit" help="(--EvL)" value="200" min="1"/>
113 <param name="EvN" type="integer" label="Number of sequences for Viterbi Gumbel mu fit" help="(--EvN)" value="200" min="1"/>
114 <param name="EfL" type="integer" label="Length of sequences for Forward exp tail tau fit" help="(--EfL)" value="100" min="1"/>
115 <param name="EfN" type="integer" label="Number of sequences for Forward exp tail tau fit" help="(--EfN)" value="200" min="1"/>
116 <param name="Eft" type="float" label="tail mass for Forward exponential tail tau fit" help="(--Eft)" value="0.04" min="0" max="1"/>
117 </xml>
118 <!-- Output flags. A table flag is added only when its format is ticked. The selection string is split on commas and names are compared exactly, because a plain substring test also finds 'tblout' inside 'domtblout'. The acc, noali and notextw switches follow. --> <token name="@OFORMAT_WITH_OPTS_NOPFAM@">
119 #if 'tblout' in str($oformat).split(','):
120 --tblout $tblout
121 #end if
122
123 #if 'domtblout' in str($oformat).split(','):
124 --domtblout $domtblout
125 #end if
126
127 $acc $noali $notextw
128 </token>
129 <xml name="oformat_with_opts_nopfam">
130 <!-- Output options: selectable result tables plus formatting switches -->
131 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
132 <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
133 <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
134 </param>
135 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
136 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
137 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
138 </xml>
139 <!-- Output flags. A table flag is added only when its format is ticked. The selection string is split on commas and names are compared exactly, because a plain substring test also finds 'tblout' inside 'domtblout' and 'pfamtblout'. The acc, noali and notextw switches follow. --> <token name="@OFORMAT_WITH_OPTS@">
140 #if 'tblout' in str($oformat).split(','):
141 --tblout $tblout
142 #end if
143
144 #if 'domtblout' in str($oformat).split(','):
145 --domtblout $domtblout
146 #end if
147
148 #if 'pfamtblout' in str($oformat).split(','):
149 --pfamtblout $pfamtblout
150 #end if
151
152 $acc $noali $notextw
153 </token>
154 <xml name="oformat_with_opts">
155 <!-- Output options: selectable result tables (including Pfam format) plus formatting switches -->
156 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
157 <option value="tblout" selected="true">Table of per-sequence hits (--tblout)</option>
158 <option value="domtblout" selected="true">Table of per-domain hits (--domtblout)</option>
159 <option value="pfamtblout" selected="true">Table of hits and domains in Pfam format (--pfamtblout)</option>
160 </param>
161 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
162 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
163 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
164 </xml>
165 <xml name="oformat_test"> <!-- Sets notextw to True. NOTE(review): presumably expanded inside tool test cases; confirm against the tool files -->
166 <param name="notextw" value="True" />
167 </xml>
168 <!-- Output flags. Format names are compared exactly against the comma-separated selection; a plain substring test would let 'tblout' match 'dfamtblout' as well. The acc, noali and notextw switches follow. -->
169 <token name="@OFORMAT_WITH_OPTS_N@">
170 #if 'tblout' in str($oformat).split(','):
171 --tblout $tblout
172 #end if
173
174 #if 'dfamtblout' in str($oformat).split(','):
175 --dfamtblout $dfamtblout
176 #end if
177
178 #if 'aliscoresout' in str($oformat).split(','):
179 --aliscoresout $aliscoresout
180 #end if
181
182 $acc $noali $notextw
183 </token>
184 <xml name="oformat_with_opts_n">
185 <!-- Output options: selectable result tables (including Dfam format and alignment scores) plus formatting switches -->
186 <param name="oformat" type="select" multiple="True" display="checkboxes" label="Output Formats">
187 <option value="tblout" selected="true">Table of hits (--tblout)</option>
188 <option value="dfamtblout" selected="true">Table of hits in Dfam format (--dfamtblout)</option>
189 <option value="aliscoresout">Scores for each position in each alignment to file (--aliscoresout)</option>
190 </param>
191 <param name="acc" type="boolean" label="Prefer accessions over names in output" help="(--acc)" truevalue="--acc" falsevalue=""/>
192 <param name="noali" type="boolean" label="Don't output alignments, so output is smaller" help="(--noali)" truevalue="--noali" falsevalue=""/>
193 <param name="notextw" type="boolean" label="Unlimited ASCII text output line width" help="(--notextw)" truevalue="--notextw" falsevalue=""/>
194 </xml>
195 <!-- Single-sequence scoring flags: popen and pextend are passed only when hssi_select equals "singlemx"; otherwise nothing is emitted --> <token name="@HSSI@">
196 #if $hssi.hssi_select == "singlemx":
197 --popen $hssi.popen
198 --pextend $hssi.pextend
199 #end if
200 </token>
201 <xml name="hssi">
202 <!-- Single-sequence input handling: gap probabilities are offered only for the singlemx choice -->
203 <conditional name="hssi">
204 <param name="hssi_select" type="select" label="Options for handling single sequence inputs">
205 <option value="false" selected="true">Disable</option>
206 <option value="singlemx">Use substitution score matrix for single-sequence inputs</option>
207 </param>
208 <when value="singlemx">
209 <param name="popen" type="float" label="Gap open probability" help="(--popen)" value="0.02" min="0.0" max="0.5"/>
210 <param name="pextend" type="float" label="Gap extend probability" help="(--pextend)" value="0.4" min="0.0" max="1.0"/>
211 </when>
212 <when value="false">
213 </when>
214 <!-- Not exposed: -mx <s>, substitution score matrix (built-in matrices, with -singlemx) -->
215 <!-- Not exposed: -mxfile <f>, read substitution score matrix from file <f> (with -singlemx) -->
216 </conditional>
217 </xml>
218 <!-- Thread-count flag: uses the GALAXY_SLOTS shell variable, falling back to 2 when it is unset or empty; the backslash leaves the variable for the shell to expand --> <token name="@CPU@">
219 --cpu \${GALAXY_SLOTS:-2}
220 </token>
221 <!-- Passes the seed input as the RNG seed flag; always emitted --> <token name="@SEED@">
222 --seed $seed
223 </token>
224 <xml name="seed"> <!-- RNG seed input: non-negative integer, 42 unless changed -->
225 <param name="seed" type="integer" label="RNG seed, 0 generates a random seed" help="(--seed)" value="42" min="0"/>
226 </xml>
227 <xml name="seed_test"> <!-- Sets seed to 4. NOTE(review): presumably expanded inside tool test cases; confirm against the tool files -->
228 <param name="seed" value="4" />
229 </xml>
230 <!-- Advanced flags: the nonull2 switch, plus Z and domZ when they evaluate true. NOTE(review): the plain truth test presumably also skips an explicit 0; confirm that is intended --> <token name="@ADV_OPTS@">
231 $nonull2
232
233 #if $Z:
234 -Z $Z
235 #end if
236
237 #if $domZ:
238 --domZ $domZ
239 #end if
240 </token>
241 <xml name="adv_opts">
242 <!-- Miscellaneous advanced options: score-correction switch and E-value search-space sizes -->
243 <param name="nonull2" type="boolean" label="Turn off biased composition score corrections" help="(--nonull2)" truevalue="--nonull2" falsevalue=""/>
244 <param name="Z" type="integer" label="# of comparisons done for E-value calculation" help="(-Z)" optional="True"/>
245 <param name="domZ" type="integer" label="# of significant sequences, for domain E-value calculation" help="(--domZ)" optional="True"/>
246 </xml>
247 <!-- Emits the value of input_format_select as-is; the option values themselves are the alphabet flags --> <token name="@FORMAT_SELECTOR@">
248 $input_format_select
249 </token>
250 <xml name="format_selector"> <!-- Alphabet choice (protein, DNA or RNA); each option value is the literal flag emitted by the FORMAT_SELECTOR token -->
251 <param name="input_format_select" type="select" label="Format of sequence and model">
252 <option value="--amino">Protein</option>
253 <option value="--dna">DNA</option>
254 <option value="--rna">RNA</option>
255 </param>
256 </xml>
257 <xml name="format_selector_noprot"> <!-- Same input name as format_selector, but offering only the DNA and RNA alphabets -->
258 <param name="input_format_select" type="select" label="Format of sequence and model">
259 <option value="--dna">DNA</option>
260 <option value="--rna">RNA</option>
261 </param>
262 </xml>
263 <!-- Relative-weighting flags: the selected strategy value is emitted as-is, followed by the wid cutoff only when the wblosum strategy is chosen --> <token name="@ARSWS@">
264 $arsws.arsws_select
265
266 #if $arsws.arsws_select == "--wblosum":
267 --wid $arsws.wid
268 #end if
269 </token>
270 <xml name="arsws">
271 <!-- Alternative relative sequence weighting strategies; each option value is the literal command-line flag, and only the wblosum choice exposes an extra input (wid) -->
272 <conditional name="arsws">
273 <param name="arsws_select" type="select" label="Alternative relative sequence weighting strategies">
274 <option value="--wpb" selected="true">Henikoff position-based weights (--wpb)</option>
275 <option value="--wgsc">Gerstein/Sonnhammer/Chothia tree weights (--wgsc)</option>
276 <option value="--wblosum">Henikoff simple filter weights (--wblosum)</option>
277 <option value="--wnone">don't do any relative weighting; set all to 1 (--wnone)</option>
278 <option value="--wgiven">use weights as given in MSA file (--wgiven)</option>
279 </param>
280 <when value="--wpb">
281 </when>
282 <when value="--wgsc">
283 </when>
284 <when value="--wblosum">
285 <param name="wid" label="Set identity cutoff" value="0.62" type="float" help="(--wid)"/>
286 </when>
287 <when value="--wnone">
288 </when>
289 <when value="--wgiven">
290 </when>
291 </conditional>
292 </xml>
293 <!-- Effective-sequence-weighting flags: nothing when the selector is empty; otherwise the selected strategy as a flag, followed by eset, ere and esigma for eent, or by eset and eid for eclust --> <token name="@AEEWS@">
294 #if $aeews.aeews_select != "":
295 --$aeews.aeews_select
296 #if $aeews.aeews_select == "eent":
297 --eset $aeews.eset
298 --ere $aeews.ere
299 --esigma $aeews.esigma
300 #elif $aeews.aeews_select == "eclust":
301 --eset $aeews.eset
302 --eid $aeews.eid
303 #end if
304 #end if
305 </token>
306 <xml name="aeews">
307 <!-- Effective sequence weighting: strategy selector with per-strategy numeric inputs -->
308 <conditional name="aeews">
309 <param name="aeews_select" type="select" label="Alternative effective sequence weighting strategies">
310 <option value="">Disabled</option>
311 <option value="eent">Adjust eff seq # to achieve relative entropy target (--eent)</option>
312 <option value="eclust">Eff seq # is the # of single linkage clusters (--eclust)</option>
313 <option value="enone">No effective seq # weighting: just use nseq (--enone)</option>
314 </param>
315 <when value="">
316 </when>
317 <when value="eent">
318 <param name="eset" type="float" label="set eff seq # for all models" help="(--eset)" value="0"/>
319 <param name="ere" type="float" label="set minimum rel entropy/position" help="(--ere)" value="0"/>
320 <param name="esigma" type="float" label="set sigma param" help="(--esigma)" value="45"/>
321 </when>
322 <when value="eclust">
323 <param name="eset" type="float" label="set eff seq # for all models" help="(--eset)" value="0"/>
324 <param name="eid" type="float" label="set fractional identity cutoff" help="(--eid)" value="0.62" min="0" max="1"/>
325 </when>
326 <when value="enone">
327 </when>
328 </conditional>
329 </xml>
330 <!-- Model-specific cutoff switches: each of cut_ga, cut_nc and cut_tc renders as its flag when ticked and as nothing otherwise --> <token name="@CUT@">
331 $cut_ga
332 $cut_nc
333 $cut_tc
334 </token>
335 <xml name="cut"> <!-- Model-specific score cutoffs: GA = gathering, NC = noise, TC = trusted, matching the wording of the CUT_HELP text -->
336 <param name="cut_ga" type="boolean" truevalue="--cut_ga" label="use profile's GA gathering cutoffs to set all thresholding" help="(--cut_ga)" falsevalue=""/>
337 <param name="cut_nc" type="boolean" truevalue="--cut_nc" label="use profile's NC noise cutoffs to set all thresholding" help="(--cut_nc)" falsevalue=""/>
338 <param name="cut_tc" type="boolean" truevalue="--cut_tc" label="use profile's TC trusted cutoffs to set all thresholding" help="(--cut_tc)" falsevalue=""/>
339 </xml>
340 <!-- Model-construction flags: the selected strategy as a flag, plus symfrac when the strategy is "fast". NOTE(review): the fragthresh input declared in the mcss macro is not emitted here; confirm the tool command handles it --> <token name="@MCSS@">
341 --$mcs.model_construction_strategy_select
342
343 #if $mcs.model_construction_strategy_select == "fast":
344 --symfrac $mcs.symfrac
345 #end if
346
347 </token>
348 <xml name="mcss">
349 <!-- Alternative model construction strategies; symfrac is offered only for the fast strategy, fragthresh applies to either -->
350 <conditional name="mcs">
351 <param name="model_construction_strategy_select" type="select" label="Model Construction Strategy">
352 <option value="fast" selected="true">Assign columns with &gt;= symfrac residues as consensus (--fast)</option>
353 <option value="hand">Manual construction (requires reference annotation) (--hand)</option>
354 </param>
355 <when value="fast">
356 <param name="symfrac" value="0.5" type="float" label="Sets sym fraction controlling --fast construction" help="(--symfrac)"/>
357 </when>
358 <when value="hand"></when>
359 </conditional>
360 <param name="fragthresh" label="Fraction of alignment length, under which sequences are excluded" help="HMMER infers fragments if the sequence length L is less than or equal to a fraction x times the alignment length in columns (--fragthresh)" value="0.5" optional="True" type="float" />
361
362 </xml>
363 <!-- Emits the value of aps_select as-is; the option values themselves are the prior flags --> <token name="@PRIOR@">
364 $aps_select
365 </token>
366 <xml name="prior"> <!-- Prior strategy choice; the default "Unspecified" option has an empty value, so the PRIOR token then emits nothing -->
367 <param name="aps_select" type="select" label="Alternative Prior Strategies">
368 <option value="" selected="true">Unspecified</option>
369 <option value="--pnone">Don't use any prior; parameters are frequencies (--pnone)</option>
370 <option value="--plaplace">Use a Laplace +1 prior (--plaplace)</option>
371 </param>
372 </xml>
373 <xml name="citation"> <!-- Citation block carrying a single DOI entry -->
374 <citations>
375 <citation type="doi">10.1093/nar/gkr367</citation>
376 </citations>
377 </xml>
378 <!-- Window-length flags: w_beta and w_length are each passed only when they evaluate true. NOTE(review): the plain truth test presumably also skips an explicit 0; confirm that is intended --> <token name="@LENGTHS@">
379 #if $w_beta:
380 --w_beta $w_beta
381 #end if
382
383 #if $w_length:
384 --w_length $w_length
385 #end if
386
387 </token>
388 <xml name="lengths"> <!-- Optional window-length inputs -->
389 <param name="w_beta" type="float" optional="True"
390 label="Tail mass at which window length is determined" help="(--w_beta)"/>
391 <param name="w_length" type="integer" optional="True"
392 label="Window Length" help="(--w_length)"/>
393 </xml>
394 <xml name="input_hmm"> <!-- Profile HMM dataset input accepting the hmmer2 and hmmer3 formats -->
395 <param name="hmmfile" type="data" format="hmmer2,hmmer3" label="HMM model"/>
396 </xml>
397 <xml name="input_msa"> <!-- Multiple sequence alignment dataset input in stockholm format -->
398 <param name="msafile" type="data" format="stockholm" label="MSA File"/>
399 </xml>
400
401
402 <token name="@ACCEL_HEUR_HELP@"><![CDATA[
403 Acceleration Heuristics (--F1, --F2, --F3)
404 -------------------------------------------
405
406 **MSV filter**
407
408 The sequence is aligned to the profile using a specialized model that
409 allows multiple high-scoring local ungapped segments to match. The
410 optimal alignment score (Viterbi score) is calculated under this
411 multi-segment model, hence the term MSV, for “multi-segment Viterbi”. This is
412 HMMER’s main speed heuristic. The MSV score is comparable to BLAST’s sum
413 score (optimal sum of ungapped alignment segments). Roughly speaking,
414 MSV is comparable to skipping the heuristic word hit and hit extension
415 steps of the BLAST acceleration algorithm.
416
417 The MSV filter is very, very fast. In addition to avoiding indel
418 calculations in the dynamic programming table, it uses reduced precision
419 scores scaled to 8-bit integers, enabling acceleration via 16-way
420 parallel SIMD vector instructions.
421
422 The MSV score is a true log-odds likelihood ratio, so it obeys
423 conjectures about the expected score distribution (Eddy, 2008) that
424 allow immediate and accurate calculation of the statistical significance
425 (P-value) of the MSV bit score.
426
427 By default, comparisons with a P-value of ≤ 0.02 pass this filter,
428 meaning that about 2% of nonhomologous sequences are expected to pass.
429 You can use the --F1 option to change this threshold. For example, --F1
430 <0.05> would pass 5% of the comparisons, making a search more sensitive
431 but slower. Setting the threshold to ≥ 1.0 (--F1 99 for example) assures
432 that all comparisons will pass. Shutting off the MSV filter may be
433 worthwhile if you want to make sure you don’t miss comparisons that have
434 a lot of scattered insertions and deletions. Alternatively, the --max
435 option causes the MSV filter step (and all other filter steps) to be
436 bypassed.
437
438 The MSV bit score is calculated as a log-odds score using the null model
439 for comparison. No correction for a biased composition or repetitive
440 sequence is done at this stage. For comparisons involving biased
441 sequences and/or profiles, more than 2% of comparisons will pass the MSV
442 filter. At the end of search output, there is a line like:
443
444 Passed MSV filter: 107917 (0.020272); expected 106468.8 (0.02)
445
446 which tells you how many and what fraction of comparisons passed the MSV
447 filter, versus how many (and what fraction) were expected.
448
449 **Viterbi filter**
450
451 The sequence is now aligned to the profile using a fast Viterbi algorithm for
452 optimal gapped alignment.
453
454 This Viterbi implementation is specialized for speed. It is implemented in
455 8-way parallel SIMD vector instructions, using reduced precision scores that
456 have been scaled to 16-bit integers. Only one row of the dynamic programming
457 matrix is stored, so the routine only recovers the score, not the optimal
458 alignment itself. The reduced representation has limited range; local alignment
459 scores will not underflow, but high scoring comparisons can overflow and return
460 infinity, in which case they automatically pass the filter.
461
462 The final Viterbi filter bit score is then computed using the appropriate null
463 model log likelihood (by default the biased composition filter model score, or
464 if the biased filter is off, just the null model score). If the P-value of this
465 score passes the Viterbi filter threshold, the sequence passes on to the next
466 step of the pipeline.
467
468 The --F2 <x> option controls the P-value threshold for passing the Viterbi
469 filter score. The default is 0.001. The --max option bypasses all filters in
470 the pipeline. At the end of a search output, you will see a line like:
471
472 Passed Vit filter: 2207 (0.00443803); expected 497.3 (0.001)
473
474 which tells you how many and what fraction of comparisons passed the Viterbi
475 filter, versus how many were expected.
476
477 **Forward filter/parser**
478
479 The sequence is now aligned to the profile using the full Forward algorithm,
480 which calculates the likelihood of the target sequence given the profile,
481 summed over the ensemble of all possible alignments.
482
483 This is a specialized time- and memory-efficient Forward implementation called
484 the “Forward parser”. It is implemented in 4-way parallel SIMD vector
485 instructions, in full precision (32-bit floating point). It stores just enough
486 information that, in combination with the results of the Backward parser
487 (below), posterior probabilities of start and stop points of alignments
488 (domains) can be calculated in the domain definition step (below), although the
489 detailed alignments themselves cannot be.
490
491 The Forward filter bit score is calculated by correcting this score using the
492 appropriate null model log likelihood (by default the biased composition filter
493 model score, or if the biased filter is off, just the null model score). If the
494 P-value of this bit score passes the Forward filter threshold, the sequence
495 passes on to the next step of the pipeline.
496
497 The bias filter score has no further effect in the pipeline. It is only used in
498 filter stages. It has no effect on final reported bit scores or P-values.
499 Biased composition compensation for final bit scores is done by a more complex
500 domain-specific algorithm, described below.
501
502 The --F3 <x> option controls the P-value threshold for passing the Forward
503 filter score. The default is 1e-5. The --max option bypasses all filters in the
504 pipeline. At the end of a search output, you will see a line like:
505
506 Passed Fwd filter: 1076 (0.00216371); expected 5.0 (1e-05)
507
508 which tells you how many and what fraction of comparisons passed the Forward
509 filter, versus how many were expected.
510
511 **Bias Filter Options**
512
513 The --max option bypasses all filters in the pipeline, including the bias
514 filter.
515
516 The --nobias option turns off (bypasses) the biased composition filter. The
517 simple null model is used as a null hypothesis for MSV and in subsequent filter
518 steps. The biased composition filter step compromises a small amount of
519 sensitivity. Though it is good to have it on by default, you may want to shut
520 it off if you know you will have no problem with biased composition hits.
521
522
523 **Advanced Documentation**
524
525 A more detailed look at the internals of the various filter pipelines was
526 posted on the `developer's blog <http://selab.janelia.org/people/eddys/blog/?p=508>`__.
527 The information posted there may be useful to those who are struggling with
528 poor-scoring sequences.
529
530 ]]></token>
531 <token name="@ADV_OPTS_HELP@"><![CDATA[
532 Advanced Options
533 ----------------
534
535 **nonull2**
536
537 The bias correction can be too aggressive sometimes, causing you to miss homologs. You can turn the
538 biased-composition score correction off with the --nonull2 option (and if
539 you’re doing that, you may also want to set --nobias, to turn off another
540 biased composition step called the bias filter, which affects which sequences
541 get scored at all).
542
543 **domZ**
544
545 Assert that the total number of targets in your searches is <x>, for the
546 purposes of per-domain conditional E-value calculations, rather than the number
547 of targets that passed the reporting thresholds.
548
549 **Z**
550
551 Assert that the total number of targets in your searches is <x>, for the
552 purposes of per-sequence E-value calculations, rather than the actual number of
553 targets seen.
554 ]]></token>
555 <token name="@AEEWS_HELP@"><![CDATA[
556 Effective Sequence Number
557 -------------------------
558
559 After relative weights are determined, they are normalized to sum to a total
560 effective sequence number, eff nseq. This number may be the actual number of
561 sequences in the alignment, but it is almost always smaller than that. The
562 default entropy weighting method (--eent) reduces the effective sequence
563 number to reduce the information content (relative entropy, or average expected
564 score on true homologs) per consensus position. The target relative entropy is
565 controlled by a two-parameter function, where the two parameters are settable
566 with --ere and --esigma.
567
568 **--eent**
569
570 Adjust effective sequence number to achieve a specific relative entropy per
571 position (see --ere). This is the default.
572
573 **--eclust**
574
575 Set effective sequence number to the number of single-linkage clusters at a
576 specific identity threshold (see --eid). This option is not recommended; it’s
577 for experiments evaluating how much better --eent is.
578
579 **--enone**
580
581 Turn off effective sequence number determination and just use the actual number
582 of sequences. One reason you might want to do this is to try to maximize the
583 relative entropy/position of your model, which may be useful for short models.
584
585 **--eset**
586
587 Explicitly set the effective sequence number for all models to <x>.
588
589 **--ere**
590
591 Set the minimum relative entropy/position target to <x>. Requires --eent. Default
592 depends on the sequence alphabet. For protein sequences, it is 0.59 bits/position;
593 for nucleotide sequences, it is 0.45 bits/position.
594
595 **--esigma**
596
597 Sets the minimum relative entropy contributed by an entire model alignment, over
598 its whole length. This has the effect of making short models have higher relative
599 entropy per position than --ere alone would give. The default is 45.0 bits.
600
601 **--eid**
602
603 Sets the fractional pairwise identity cutoff used by single linkage clustering
604 with the --eclust option. The default is 0.62.
605 ]]></token>
606 <token name="@ARSWS_HELP@"><![CDATA[
607 Options Controlling Relative Weights
608 ------------------------------------
609
610 HMMER uses an ad hoc sequence weighting algorithm to downweight closely related
611 sequences and up-weight distantly related ones. This has the effect of making
612 models less biased by uneven phylogenetic representation. For example, two
613 identical sequences would typically each receive half the weight that one
614 sequence would. These options control which algorithm gets used.
615
616
617 **--wpb**
618
619 Use the Henikoff position-based sequence weighting scheme [Henikoff and
620 Henikoff, J. Mol. Biol. 243:574, 1994]. This is the default.
621
622 **--wgsc**
623
624 Use the Gerstein/Sonnhammer/Chothia weighting algorithm [Gerstein et al, J.
625 Mol. Biol. 235:1067, 1994].
626
627 **--wblosum**
628
629 Use the same clustering scheme that was used to weight data in calculating
630 BLOSUM substitution matrices [Henikoff and Henikoff, Proc. Natl. Acad. Sci
631 89:10915, 1992]. Sequences are single-linkage clustered at an identity
632 threshold (default 0.62; see --wid) and within each cluster of c sequences,
633 each sequence gets relative weight 1/c.
634
635 **--wnone**
636
637 No relative weights. All sequences are assigned uniform weight.
638
639 **--wid**
640
641 Sets the identity threshold used by single-linkage clustering when using
642 --wblosum. Invalid with any other weighting scheme. Default is 0.62.
643 ]]></token>
644 <token name="@BIAS_COMP_HELP@"><![CDATA[
645 Bias Composition
646 ----------------
647
648 The next number, the bias, is a correction term for biased sequence composition
649 that has been applied to the sequence bit score. For instance, for the top hit
650 MYG_PHYCA that scored 222.7 bits, the bias of 3.2 bits means that this sequence
651 originally scored 225.9 bits, which was adjusted by the slight 3.2 bit
652 biased-composition correction. The only time you really need to pay attention to the
653 bias value is when it’s large, on the same order of magnitude as the sequence
654 bit score. Sometimes (rarely) the bias correction isn’t aggressive enough, and
655 allows a non-homolog to retain too much score. Conversely, the bias correction
656 can be too aggressive sometimes, causing you to miss homologs. You can turn the
657 biased-composition score correction off with the --nonull2 option (and if
658 you’re doing that, you may also want to set --nobias, to turn off another
659 biased composition step called the bias filter, which affects which sequences
660 get scored at all).
661
662 ]]></token>
663 <token name="@CUT_HELP@"><![CDATA[
664 Options for Model-specific Score Thresholding
665 ---------------------------------------------
666
667 Curated profile databases may define specific bit score thresholds for each
668 profile, superseding any thresholding based on statistical significance alone.
669 To use these options, the profile must contain the appropriate (GA, TC, and/or
670 NC) optional score threshold annotation; this is picked up by hmmbuild from
671 Stockholm format alignment files. Each thresholding option has two scores: the
672 per-sequence threshold <x1> and the per-domain threshold <x2>. These act as if
673 -T<x1> --incT<x1> --domT<x2> --incdomT<x2> has been applied specifically using
674 each model’s curated thresholds.
675
676 **--cut_ga**
677
678 Use the GA (gathering) bit scores in the model to set per-sequence (GA1) and
679 per-domain (GA2) reporting and inclusion thresholds. GA thresholds are
680 generally considered to be the reliable curated thresholds defining family
681 membership; for example, in Pfam, these thresholds define what gets included in
682 Pfam Full alignments based on searches with Pfam Seed models.
683
684 **--cut_nc**
685
686 Use the NC (noise cutoff) bit score thresholds in the model to set
687 per-sequence (NC1) and per-domain (NC2) reporting and inclusion thresholds. NC
688 thresholds are generally considered to be the score of the highest-scoring
689 known false positive.
690
691 **--cut_tc**
692
693 Use the TC (trusted cutoff) bit score thresholds in the model to set
694 per-sequence (TC1) and per-domain (TC2) reporting and inclusion thresholds. TC
695 thresholds are generally considered to be the score of the lowest-scoring known
696 true positive that is above all known false positives.
697 ]]></token>
698 <token name="@EVAL_CALIB_HELP@"><![CDATA[
699 Options Controlling H3 Parameter Estimation Methods
700 ---------------------------------------------------
701
702 H3 uses three short random sequence simulations to estimate the location
703 parameters for the expected score distributions for MSV scores, Viterbi scores,
704 and Forward scores. These options allow these simulations to be modified.
705
706 **--EmL**
707
708 Sets the sequence length in simulation that estimates the location parameter mu
709 for MSV E-values. Default is 200.
710
711 **--EmN**
712
713 Sets the number of sequences in simulation that estimates the location parameter
714 mu for MSV E-values. Default is 200.
715
716 **--EvL**
717
718 Sets the sequence length in simulation that estimates the location parameter mu
719 for Viterbi E-values. Default is 200.
720
721 **--EvN**
722
723 Sets the number of sequences in simulation that estimates the location parameter
724 mu for Viterbi E-values. Default is 200.
725
726
727 **--EfL**
728
739 Sets the tail mass fraction to fit in the simulation that estimates the location
740 parameter tau for Forward E-values. Default is 0.04.
731
732 **--EfN**
733
734 Sets the number of sequences in simulation that estimates the location parameter
735 tau for Forward E-values. Default is 200.
736
737 **--Eft**
738
739 Sets the tail mass fraction to fit in the simulation that estimates the location param-
740 eter tau for Forward evalues. Default is 0.04.
741 ]]></token>
742 <token name="@FORMAT_SELECTOR_HELP@"><![CDATA[
743 Options for Specifying the Alphabet
744 -----------------------------------
745
746 The alphabet type (amino, DNA, or RNA) is autodetected by default, by looking
747 at the composition of the msafile. Autodetection is normally quite reliable,
748 but occasionally alphabet type may be ambiguous and autodetection can fail (for
749 instance, on tiny toy alignments of just a few residues). To avoid this, or to
750 increase robustness in automated analysis pipelines, you may specify the
751 alphabet type of msafile with these options.
752 ]]></token>
753 <token name="@HSSI_HELP@"><![CDATA[
754 Options Controlling Single Sequence Scoring (first Iteration)
755 -------------------------------------------------------------
756
757 By default, the first iteration uses a search model constructed from a single
758 query sequence. This model is constructed using a standard 20x20 substitution
759 matrix for residue probabilities, and two additional parameters for
760 position-independent gap open and gap extend probabilities. These options allow
761 the default single-sequence scoring parameters to be changed.
762
763 **Gap Open (--popen)**
764
765 Set the gap open probability for a single sequence query model to <x>.
766
767 **Gap Extend (--pextend)**
768
769 Set the gap extend probability for a single sequence query model to <x>.
770
771
772 **--mx/--mxfile**
773
774 These options are not currently supported
775 ]]></token>
776 <token name="@LENGTHS_HELP@"><![CDATA[
777 Tail Mass Options
778 -----------------
779
780 **Window length tail mass (--w_beta)**
781
782 The upper bound, W, on the length at which nhmmer expects to find an instance
783 of the model is set such that the fraction of all sequences generated by the
784 model with length >= W is less than <x>. The default is 1e-7.
785
786
787 **Model instance length upper bound (--w_length)**
788
789 Override the model instance length upper bound, W, which is otherwise
790 controlled by --w_beta. It should be larger than the model length. The value of
791 W is used deep in the acceleration pipeline, and modest changes are not
792 expected to impact results (though larger values of W do lead to longer run
793 time).
794
795 ]]></token>
796 <token name="@MCSS_HELP@"><![CDATA[
797 **Options Controlling Profile Construction**
798
799 These options control how consensus columns are defined in an alignment.
800
801 **--fast**
802
803 Define consensus columns as those that have a fraction >= symfrac of residues
804 as opposed to gaps. (See below for the --symfrac option.) This is the default.
805
806 **--hand**
807
808 Define consensus columns in next profile using reference annotation to the multiple
809 alignment. This allows you to define any consensus columns you like.
810
811
812 **--symfrac**
813
814 Define the residue fraction threshold necessary to define a consensus column
815 when using the --fast option. The default is 0.5. The symbol fraction in each
816 column is calculated after taking relative sequence weighting into account, and
817 ignoring gap characters corresponding to ends of sequence fragments (as opposed
818 to internal insertions/deletions). Setting this to 0.0 means that every
819 alignment column will be assigned as consensus, which may be useful in some
820 cases. Setting it to 1.0 means that only columns that include 0 gaps (internal
821 insertions/deletions) will be assigned as consensus.
822
823 **--fragthresh**
824
825 We only want to count terminal gaps as deletions if the aligned sequence is
826 known to be full-length, not if it is a fragment (for instance, because only
827 part of it was sequenced). HMMER uses a simple rule to infer fragments: if the
828 sequence length L is less than or equal to a fraction <x> times the alignment
829 length in columns, then the sequence is handled as a fragment. The default is
830 0.5. Setting --fragthresh 0 will define no (nonempty) sequence as a fragment;
831 you might want to do this if you know you’ve got a carefully curated alignment
832 of full-length sequences. Setting --fragthresh 1 will define all sequences as
833 fragments; you might want to do this if you know your alignment is entirely
834 composed of fragments, such as translated short reads in metagenomic shotgun
835 data.
836
837 ]]></token>
838 <token name="@OFORMAT_WITH_OPTS_HELP@"><![CDATA[
839 Options for Controlling Output
840 ------------------------------
841
842 **Table of hits**
843
844 Save a simple tabular (space-delimited) file summarizing the per-target output, with
845 one data line per homologous target model found.
846
847 **Table of per-domain hits**
848
849 Save a simple tabular (space-delimited) file summarizing the per-domain output,
850 with one data line per homologous domain detected in a query sequence for each
851 homologous model.
852
853 **Table of hits and domains in Pfam Format**
854
855 Save an especially succinct tabular (space-delimited) file summarizing the
856 per-target output, with one data line per homologous target model found.
857 ]]></token>
858 <token name="@OFORMAT_WITH_OPTS_NOPFAM_HELP@"><![CDATA[
859 Options for Controlling Output
860 ------------------------------
861
862 **Table of hits**
863
864 Save a simple tabular (space-delimited) file summarizing the per-target output, with
865 one data line per homologous target model found.
866
867 **Table of per-domain hits**
868
869 Save a simple tabular (space-delimited) file summarizing the per-domain output,
870 with one data line per homologous domain detected in a query sequence for each
871 homologous model.
872 ]]></token>
873 <token name="@OFORMAT_WITH_OPTS_N_HELP@"><![CDATA[
874 Options for Controlling Output
875 ------------------------------
876
877 **Table of hits**
878
879 Save a simple tabular (space-delimited) file summarizing the per-target output, with
880 one data line per homologous target model found.
881
882 **Table of hits (dfam)**
883
884 Save a tabular (space-delimited) file summarizing the per-hit output, similar
885 to --tblout but more succinct.
886
887
888 **List of per-position scores for each hit (--aliscoreout)**
889
890 Save to file a list of per-position scores for each hit. This is useful, for
891 example, in identifying regions of high score density for use in resolving
892 overlapping hits from different models.
893
894 ]]></token>
895 <token name="@PRIOR_HELP@"><![CDATA[
896 Options Controlling Priors
897 --------------------------
898
899 By default, weighted counts are converted to mean posterior probability
900 parameter estimates using mixture Dirichlet priors. Default mixture Dirichlet
901 prior parameters for protein models and for nucleic acid (RNA and DNA) models
902 are built in. The following options allow you to override the default priors.
903
904 **No priors (--pnone)**
905
906 Don’t use any priors. Probability parameters will simply be the observed
907 frequencies, after relative sequence weighting.
908
909 **Laplace +1 prior**
910
911 Use a Laplace +1 prior in place of the default mixture Dirichlet prior.
912 ]]></token>
913 <token name="@SEED_HELP@"><![CDATA[
914 Random Seeding
915 --------------
916
917 Seed the random number generator with <n>, an integer >= 0. If <n> is nonzero,
918 any stochastic simulations will be reproducible; the same command will give the
919 same results. If <n> is 0, the random number generator is seeded arbitrarily,
920 and stochastic simulations will vary from run to run of the same command.
921
922 ]]></token>
923 <token name="@THRESHOLDS_HELP@"><![CDATA[
924 Options for Reporting Thresholds
925 --------------------------------
926
927 Reporting thresholds control which hits are reported in output files (the main
928 output, --tblout, and --domtblout).
929
930 **E-value (-E)**
931
932 In the per-target output, report target profiles with an E-value of <= <x>. The
933 default is 10.0, meaning that on average, about 10 false positives will be
934 reported per query, so you can see the top of the noise and decide for yourself
935 if it’s really noise.
936
937 **Bit score (-T)**
938
939 Instead of thresholding per-profile output on E-value, instead report target profiles
940 with a bit score of >= <x>.
941
942 **domain E-value (--domE)**
943
944 In the per-domain output, for target profiles that have already satisfied the
945 per-profile reporting threshold, report individual domains with a conditional
946 E-value of <= <x>. The default is 10.0. A conditional E-value means the
947 expected number of additional false positive domains in the smaller search
948 space of those comparisons that already satisfied the per-profile reporting
949 threshold (and thus must have at least one homologous domain already).
950
951 **domain Bit scores (--domT)**
952
953 Instead of thresholding per-domain output on E-value, instead report domains
954 with a bit score of >= <x>.
955
956 Options for Inclusion Thresholds
957 --------------------------------
958
959 Inclusion thresholds are stricter than reporting thresholds. Inclusion
960 thresholds control which hits are considered to be reliable enough to be
961 included in an output alignment or a subsequent search round. In hmmscan, which
962 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
963 search steps (like jackhmmer), inclusion thresholds have little effect. They
964 only affect what domains get marked as significant (!) or questionable (?) in
965 domain output.
966
967 **E-value of per target inclusion threshold**
968
969 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
970 0.01, meaning that on average, about 1 false positive would be expected in
971 every 100 searches with different query sequences.
972
973 **Bit score of per target inclusion threshold**
974
975 Instead of using E-values for setting the inclusion threshold, instead use a
976 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
977 to use bit score thresholds with hmmscan, because you don’t expect a single
978 score threshold to work for different profiles; different profiles have
979 slightly different expected score distributions.
980
981 **domain E-value per target inclusion threshold**
982
983 Use a conditional E-value of <= <x> as the per-domain inclusion threshold, in
984 targets that have already satisfied the overall per-target inclusion threshold.
985
986 **domain Bit score per target inclusion threshold**
987
988 Instead of using E-values, instead use a bit score of >= <x> as the per-domain
989 inclusion threshold. As with --incT above, it would be unusual to use a single
990 bit score threshold in hmmscan.
991
992 ]]></token>
993 <token name="@THRESHOLDS_NODOM_HELP@"><![CDATA[
994 Options for Reporting Thresholds
995 --------------------------------
996
997 Reporting thresholds control which hits are reported in output files (the main
998 output, --tblout, and --domtblout).
999
1000 **E-value (-E)**
1001
1002 In the per-target output, report target profiles with an E-value of <= <x>. The
1003 default is 10.0, meaning that on average, about 10 false positives will be
1004 reported per query, so you can see the top of the noise and decide for yourself
1005 if it’s really noise.
1006
1007 **Bit score (-T)**
1008
1009 Instead of thresholding per-profile output on E-value, instead report target profiles
1010 with a bit score of >= <x>.
1011
1012 Options for Inclusion Thresholds
1013 --------------------------------
1014
1015 Inclusion thresholds are stricter than reporting thresholds. Inclusion
1016 thresholds control which hits are considered to be reliable enough to be
1017 included in an output alignment or a subsequent search round. In hmmscan, which
1018 does not have any alignment output (like hmmsearch or phmmer) nor any iterative
1019 search steps (like jackhmmer), inclusion thresholds have little effect. They
1020 only affect what domains get marked as significant (!) or questionable (?) in
1021 domain output.
1022
1023 **E-value of per target inclusion threshold**
1024
1025 Use an E-value of <= <x> as the per-target inclusion threshold. The default is
1026 0.01, meaning that on average, about 1 false positive would be expected in
1027 every 100 searches with different query sequences.
1028
1029 **Bit score of per target inclusion threshold**
1030
1031 Instead of using E-values for setting the inclusion threshold, instead use a
1032 bit score of >= <x> as the per-target inclusion threshold. It would be unusual
1033 to use bit score thresholds with hmmscan, because you don’t expect a single
1034 score threshold to work for different profiles; different profiles have
1035 slightly different expected score distributions.
1036
1037 ]]></token>
1038 <token name="@ATTRIBUTION@"><![CDATA[
1039
1040 Attribution
1041 -----------
1042
1043 This Galaxy tool relies on HMMER3_ from http://hmmer.janelia.org/
1044 Internally the software is cited as:
1045
1046 ::
1047
1048 # hmmscan :: search sequence(s) against a profile database
1049 # HMMER 3.1 (February 2013); http://hmmer.org/
1050 # Copyright (C) 2011 Howard Hughes Medical Institute.
1051 # Freely distributed under the GNU General Public License (GPLv3).
1052 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1053
1054 The wrappers were written by Eric Rasche and are licensed under Apache2_. The
1055 documentation is copied from the HMMER3 documentation.
1056
1057 .. _Apache2: http://www.apache.org/licenses/LICENSE-2.0
1058 .. _HMMER3: http://hmmer.janelia.org/
1059
1060
1061 ]]></token>
1062 <token name="@HELP_PRE@"><![CDATA[
1063
1064 What it does
1065 ============
1066 ]]></token>
1067 <token name="@HELP_PRE_OTH@"><![CDATA[
1068 Options
1069 =======
1070 ]]></token>
1071 </macros>