comparison cmalign.xml @ 1:55bb96edfc07 draft

author bgruening
date Thu, 24 Apr 2014 15:02:05 -0400
children fac157e22e1b
equal deleted inserted replaced
0:652f9d550531 1:55bb96edfc07
1 <tool id="infernal_cmsearch" name="Search covariance model(s)" version="">
<!-- One-line description element; its text names cmsearch as the wrapped program. -->
2 <description>against a sequence database (cmsearch)</description>
<!-- Parallelism settings: the "seqdb" input is split (split_mode="to_size", split_size="100")
     and the two listed outputs are merged again.
     NOTE(review): how split_size is interpreted depends on the Galaxy splitter; confirm. -->
3 <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="100" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"></parallelism>
4 <requirements>
<!-- Infernal is pinned to the 1.1 package; the former additional unversioned "infernal"
     entry duplicated this requirement and was removed. -->
6 <requirement type="package" version="1.1">infernal</requirement>
<!-- NOTE(review): presumably needed for mktemp, which the command template calls; confirm. -->
7 <requirement type="package" version="8.21">gnu_coreutils</requirement>
8 </requirements>
9 <command>
10 ## a temp file is needed, because the standard tabular output from infernal is not useful in Galaxy
11 ## it will be converted to a tab delimited file and piped to Galaxy
12 temp_tabular_output=\$(mktemp);
14 cmsearch
15 ## Infernal Options
16 --cpu "\${GALAXY_SLOTS:-12}"
17 -o /dev/null
18 --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip
19 $bottomonly
20 $toponly
21 $cyk
22 $notrunc
23 $max
24 $nohmm
25 $mid
26 ##$bitscore_thresholds
27 --tblout \$temp_tabular_output
28 $g
29 #if $A:
30 $A $multiple_alignment_output
31 #end if
33 #if $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incE":
34 --incE $inclusion_thresholds_opts.incE
35 #elif $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incT":
36 --incT $inclusion_thresholds_opts.incT
37 #end if
39 #if $reporting_thresholds_opts.reporting_thresholds_selector == "-E":
40 -E $reporting_thresholds_opts.E
41 #elif $reporting_thresholds_opts.reporting_thresholds_selector == "-T":
42 -T $reporting_thresholds_opts.T
43 #end if
45 ## CM file from the history or stored as database on disc
47 #if $cm_opts.cm_opts_selector == "db":
48 $cm_opts.database.fields.path
49 #else:
50 $cm_opts.cmfile
51 #end if
53 ## sequence file
54 $seqdb
55 2>&#38;1
56 ;
58 ## 1. delete comments (everything from a hash sign to the end of the line) and the empty lines this leaves behind
59 ## 2. turn every space into a tab, then turn the 18th and all following tabs of a line back into spaces,
## so the first 17 fields are tab separated and the 18th field (free text, can contain spaces) stays in one column
## NOTE(review): cmsearch pads its table columns with runs of spaces; confirm that the space pattern below
## is what the real file contains (whitespace may have been collapsed in the view this was taken from)
60 sed -e 's/#.*$//' -e '/^$/d' -e 's/ /\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile
## remove the temp file after a successful conversion; if sed fails its exit status is kept
## and the temp file is left in place for inspection
&amp;&amp; rm -f \$temp_tabular_output
62 </command>
63 <inputs>
<!-- Target sequences to search: a dataset of format "fasta". -->
65 <param name="seqdb" type="data" format="fasta" label="Sequence database"/>
<!-- Source of the covariance model(s): an entry of infernal.loc ("db") or a dataset from the history ("histdb"). -->
66 <conditional name="cm_opts">
67 <param name="cm_opts_selector" type="select" label="Subject covariance models">
68 <option value="db" selected="True">Locally installed covariance models</option>
69 <option value="histdb">Covariance model from your history</option>
70 </param>
71 <when value="db">
72 <param name="database" type="select" label="Covariance models">
<!-- infernal.loc columns: 0 = value, 1 = name, 2 = path; the command template reads database.fields.path. -->
73 <options from_file="infernal.loc">
74 <column name="value" index="0"/>
75 <column name="name" index="1"/>
76 <column name="path" index="2"/>
77 </options>
78 </param>
79 </when>
80 <when value="histdb">
<!-- The command template passes this dataset as the CM file argument. -->
81 <param name="cmfile" type="data" format="txt" label="Covariance models file from the history."/>
82 </when>
83 </conditional>
<!-- Boolean switches: each one expands to its truevalue when checked and to an empty string otherwise. -->
85 <param name="g" truevalue="-g" falsevalue="" checked="False" type="boolean"
86 label="Turn on the glocal alignment algorithm" help="... global with respect to the query model and local with respect to the target database."/>
88 <param name="bottomonly" truevalue="--bottomonly" falsevalue="" checked="False" type="boolean"
89 label="Only search the bottom (Crick) strand of target sequences" help="in the sequence database"/>
90 <param name="toponly" truevalue="--toponly" falsevalue="" checked="False" type="boolean"
91 label="Only search the top (Watson) strand of target sequences" help="in the sequence database"/>
93 <param name="cyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean"
94 label="Use the CYK algorithm, not Inside, to determine the final score of all hits" help=""/>
<!-- Fixed: the parameter was named "--acyk" (not usable as a template variable) and emitted "--cyk",
     duplicating the switch above. NOTE(review): the command template does not reference $acyk,
     so this switch has no effect until it is added there. -->
95 <param name="acyk" truevalue="--acyk" falsevalue="" checked="False" type="boolean"
96 label="Use the CYK algorithm to align hits" help="By default, the Durbin/Holmes optimal accuracy algorithm is used, which finds the alignment that maximizes the expected accuracy of all aligned residues."/>
98 <param name="notrunc" truevalue="--notrunc" falsevalue="" checked="False" type="boolean"
99 label="Turn off truncated hit detection" help=""/>
101 <!-- acceleration pipeline: boolean switches that disable filter stages; each expands to its truevalue when checked -->
103 <param name="max" truevalue="--max" falsevalue="" checked="False" type="boolean"
104 label="Turn off all filters, and run non-banded Inside on every full-length target sequence" help="This
105 increases sensitivity somewhat, at an extremely large cost in speed."/>
107 <param name="nohmm" truevalue="--nohmm" falsevalue="" checked="False" type="boolean"
108 label="Turn off all HMM filter stages " help=""/>
110 <param name="mid" truevalue="--mid" falsevalue="" checked="False" type="boolean"
111 label="Turn off the HMM SSV and Viterbi filter stages" help=""/>
114 <!-- Options for model-specific score thresholding -->
115 <!--
116 <param name="bitscore_thresholds" type="select" label="Bit score thresholds" help="Curated CM databases may define specific bit score thresholds for each CM, superseding any thresholding based on statistical significance alone.">
117 <option value="" selected="true">None</option>
118 <option value=" - -cut_ga">GA (gathering) bit scores</option>
119 <option value=" - -cut_nc">NC (noise cutoff) bit score</option>
120 <option value=" - -cut_tc">TC (trusted cutoff) bit score</option>
121 </param>
122 -->
123 <!-- Options for inclusion thresholds -->
<!-- The selector value doubles as the command-line flag; the command template adds the flag and the
     matching number only for "--incE" and "--incT". -->
126 <conditional name="inclusion_thresholds_opts">
127 <param name="inclusion_thresholds_selector" type="select" label="Inclusion thresholds"
128 help="Inclusion thresholds are stricter than reporting thresholds. Inclusion thresholds control which hits are considered to be reliable enough to be included in an output alignment or in a possible subsequent search round, or marked as significant (”!”) as opposed to questionable (”?”) in hit output.">
129 <option value="" selected="true">default</option>
130 <option value="--incE">Use E-value</option>
131 <option value="--incT">Use bit score</option>
132 </param>
<!-- Fixed: the case for the default option needs value="" so it matches the option above. -->
133 <when value=""/>
134 <when value="--incE">
135 <param name="incE" type="float" value="0.01" size="5" label="Use E-value" help="of &lt;= X as the hit inclusion threshold.">
136 <sanitizer>
137 <valid initial="string.printable">
138 <remove value="&apos;"/>
139 </valid>
140 </sanitizer>
141 </param>
142 </when>
143 <when value="--incT">
144 <param name="incT" type="integer" size="5" value="0" label="Use bit score" help="of >= X as the hit inclusion threshold.">
145 <sanitizer>
146 <valid initial="string.printable">
147 <remove value="&apos;"/>
148 </valid>
149 </sanitizer>
150 </param>
151 </when>
152 </conditional>
154 <!-- Options controlling reporting thresholds -->
<!-- The selector value doubles as the command-line flag; the command template adds the flag and the
     matching number only for "-E" and "-T". -->
156 <conditional name="reporting_thresholds_opts">
157 <param name="reporting_thresholds_selector" type="select" label="reporting thresholds"
158 help="Reporting thresholds control which hits are reported in output files">
159 <option value="" selected="true">default</option>
160 <option value="-E">Use E-value</option>
161 <option value="-T">Use bit score</option>
162 </param>
<!-- Fixed: the case for the default option needs value="" so it matches the option above. -->
163 <when value=""/>
164 <when value="-E">
165 <param name="E" type="float" value="10.0" size="5" label="Use E-value" help="of &lt;= X as the hit reporting threshold. The default is 10.0, meaning that on average, about 10 false positives will be reported per query, so you can see the top of the noise and decide for yourself if it’s really noise.">
166 <sanitizer>
167 <valid initial="string.printable">
168 <remove value="&apos;"/>
169 </valid>
170 </sanitizer>
171 </param>
172 </when>
173 <when value="-T">
174 <param name="T" type="integer" size="5" value="0" label="Use bit score" help="of >= X as the hit reporting threshold.">
175 <sanitizer>
176 <valid initial="string.printable">
177 <remove value="&apos;"/>
178 </valid>
179 </sanitizer>
180 </param>
181 </when>
182 </conditional>
<!-- When checked, the command template passes "-A" followed by the multiple_alignment_output dataset;
     the output filter "A is True" also depends on this switch. -->
184 <param name="A" truevalue="-A" falsevalue="" checked="False" type="boolean"
185 label="Save a multiple alignment of all significant hits" help="... those satisfying inclusion thresholds"/>
187 </inputs>
188 <outputs>
<!-- Tab-delimited hit table written by the sed conversion step of the command template. -->
190 <data format="tabular" name="outfile" label="cmsearch on ${on_string}"/>
<!-- Only created when the "A" switch is checked.
     NOTE(review): this dataset receives the file written by -A (a multiple alignment), yet it is
     declared as "tabular"; confirm whether an alignment datatype should be used instead. -->
191 <data format="tabular" name="multiple_alignment_output" label="cmsearch on ${on_string} (multi alignment)">
192 <filter>A is True</filter>
193 </data>
195 </outputs>
196 <help>
199 **What it does**
201 cmalign aligns the RNA sequences in &lt;seqfile&gt; to the covariance model (CM) in &lt;cmfile&gt;. The new alignment is
202 output to stdout in Stockholm format, but can be redirected to a file &lt;f&gt; with the -o &lt;f&gt; option.
203 Either &lt;cmfile&gt; or &lt;seqfile&gt; (but not both) may be ’-’ (dash), which means reading this input from stdin rather than a
204 file.
205 The sequence file &lt;seqfile&gt; must be in FASTA or Genbank format.
206 cmalign uses an HMM banding technique to accelerate alignment by default as described below for the --hbanded
207 option. HMM banding can be turned off with the --nonbanded option.
208 By default, cmalign computes the alignment with maximum expected accuracy that is consistent with constraints
209 (bands) derived from an HMM, using a banded version of the Durbin/Holmes optimal accuracy algorithm. This
210 behavior can be changed with the --cyk or --sample options.
211 cmalign takes special care to correctly align truncated sequences, where some nucleotides from the beginning (5’)
212 and/or end (3’) of the actual full length biological sequence are not present in the input sequence (see DL Kolbe and
213 SR Eddy, Bioinformatics, 25:1236-1243, 2009). This behavior is on by default, but can be turned off with --notrunc. In
214 previous versions of cmalign the --sub option was required to appropriately handle truncated sequences. The --sub
215 option is still available in this version, but the new default method for handling truncated sequences should be as good
216 or superior to the sub method in nearly all cases.
217 The --mapali &lt;s&gt; option allows inclusion of the fixed training alignment used to build the CM from file &lt;s&gt; within the
218 output alignment of cmalign.
219 It is possible to merge two or more alignments created by the same CM using the Easel miniapp esl-alimerge (included
220 in the easel/miniapps/ subdirectory of Infernal). Previous versions of cmalign included options to merge alignments
221 but they were deprecated upon development of esl-alimerge, which is significantly more memory efficient.
222 By default, cmalign will output the alignment to stdout. The alignment can be redirected to an output file &lt;f&gt; with the
223 -o &lt;f&gt; option. With -o, information on each aligned sequence, including score and model alignment boundaries will be
224 printed to stdout (more on this below).
225 The output alignment will be in Stockholm format by default. This can be changed to Pfam, aligned FASTA (AFA), A2M,
226 Clustal, or Phylip format using the --outformat &lt;s&gt; option, where &lt;s&gt; is the name of the desired format. As a special
227 case, if the output alignment is large (more than 10,000 sequences or more than 10,000,000 total nucleotides) then the
228 output format will be Pfam format, with each sequence appearing on a single line, for reasons of memory efficiency. For
229 alignments larger than this, using --ileaved will force interleaved Stockholm format, but the user should be aware that
230 this may require a lot of memory. --ileaved will only work for alignments up to 100,000 sequences or 100,000,000 total
231 nucleotides.
232 If the output alignment format is Stockholm or Pfam, the output alignment will be annotated with posterior probabilities
233 which estimate the confidence level of each aligned nucleotide. This annotation appears as lines beginning with ”#=GR
234 &lt;seq name&gt; PP”, one per sequence, each immediately below the corresponding aligned sequence ”&lt;seq name&gt;”.
235 Characters in PP lines have 12 possible values: ”0-9”, ”*”, or ”.”. If ”.”, the position corresponds to a gap in the sequence.
236 A value of ”0” indicates a posterior probability of between 0.0 and 0.05, ”1” indicates between 0.05 and 0.15, ”2”
237 indicates between 0.15 and 0.25 and so on up to ”9” which indicates between 0.85 and 0.95. A value of ”*” indicates
238 a posterior probability of between 0.95 and 1.0. Higher posterior probabilities correspond to greater confidence that
239 the aligned nucleotide belongs where it appears in the alignment. With --nonbanded, the calculation of the posterior
240 probabilities considers all possible alignments of the target sequence to the CM. Without --nonbanded (i.e. in default
241 mode), the calculation considers only possible alignments within the HMM bands. Further, the posterior probabilities
242 are conditional on the truncation mode of the alignment. For example, if the sequence alignment is truncated 5’, a PP
243 value of ”9” indicates between 0.85 and 0.95 of all 5’ truncated alignments include the given nucleotide at the given
244 position. The posterior annotation can be turned off with the --noprob option. If --small is enabled, posterior annotation
245 must also be turned off using --noprob.
246 The tabular output that is printed to stdout if the -o option is used includes one line per sequence and twelve fields
247 per line: ”idx”: the index of the sequence in the input file, ”seq name”: the sequence name; ”length”: the length of the
248 sequence; ”cm from” and ”cm to”: the model start and end positions of the alignment; ”trunc”: ”no” if the sequence is
249 not truncated, ”5’” if the beginning of the sequence is truncated, ”3’” if the end of the sequence is truncated, and ”5’&amp;3’”
250 if both the beginning and the end are truncated; ”bit sc”: the bit score of the alignment, ”avg pp” the average posterior
251 probability of all aligned nucleotides in the alignment; ”band calc”, ”alignment” and ”total”: the time in seconds required
252 for calculating HMM bands, computing the alignment, and complete processing of the sequence, respectively; ”mem
253 (Mb)”: the size in Mb of all dynamic programming matrices required for aligning the sequence. This tabular data can be
254 saved to file &lt;f&gt; with the --sfile &lt;f&gt; option.
257 Options for controlling the alignment algorithm
258 --optacc Align sequences using the Durbin/Holmes optimal accuracy algorithm. This is the default.
259 The optimal accuracy alignment will be constrained by HMM bands for acceleration unless
260 the --nonbanded option is enabled. The optimal accuracy algorithm determines the
261 alignment that maximizes the posterior probabilities of the aligned nucleotides within it. The
262 posterior probabilities are determined using (possibly HMM banded) variants of the Inside
263 and Outside algorithms.
264 --cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the sequences, instead
265 use the CYK algorithm which determines the optimally scoring (maximum likelihood) align-
266 ment of the sequence to the model, given the HMM bands (unless --nonbanded is also
267 enabled).
268 --sample Sample an alignment from the posterior distribution of alignments. The posterior distribution
269 is determined using an HMM banded (unless --nonbanded) variant of the Inside algorithm.
270 --seed &lt;n&gt; Seed the random number generator with &lt;n&gt;, an integer >= 0. This option can only be
271 used in combination with --sample. If &lt;n&gt; is nonzero, stochastic sampling of alignments
272 will be reproducible; the same command will give the same results. If &lt;n&gt; is 0, the random
273 number generator is seeded arbitrarily, and stochastic samplings may vary from run to run
274 of the same command. The default seed is 181.
275 --notrunc Turn off truncated alignment algorithms. All sequences in the input file will be assumed to be
276 full length, unless --sub is also used, in which case the program can still handle truncated
277 sequences but will use an alternative strategy for their alignment.
278 --sub Turn on the sub model construction and alignment procedure. For each sequence, an HMM
279 is first used to predict the model start and end consensus columns, and a new sub CM is
280 constructed that only models consensus columns from start to end. The sequence is then
281 aligned to this sub CM. Sub alignment is an older method than the default one for aligning
282 sequences that are possibly truncated. By default, cmalign uses special DP algorithms to
283 handle truncated sequences which should be more accurate than the sub method in most
284 cases. --sub is still included as an option mainly for testing against this default truncated
285 sequence handling. This ”sub CM” procedure is not the same as the ”sub CMs” described
286 by Weinberg and Ruzzo.
289 Other options
290 --mapali &lt;f&gt; Reads the alignment from file &lt;f&gt; used to build the model and aligns it as a single object to
291 the CM; e.g. the alignment in &lt;f&gt; is held fixed. This allows you to align sequences to a
292 model with cmalign and view them in the context of an existing trusted multiple alignment.
293 &lt;f&gt; must be the alignment file that the CM was built from. The program verifies that the
294 checksum of the file matches that of the file used to construct the CM. A similar option to
295 this one was called --withali in previous versions of cmalign.
296 --mapstr Must be used in combination with --mapali &lt;f&gt;. Propagate structural information for any
297 pseudoknots that exist in &lt;f&gt; to the output alignment. A similar option to this one was called
298 --withstr in previous versions of cmalign.
299 --informat &lt;s&gt; Assert that the input &lt;seqfile&gt; is in format &lt;s&gt;. Do not run Babelfish format autodetection.
300 This increases the reliability of the program somewhat, because the Babelfish can
301 make mistakes; particularly recommended for unattended, high-throughput runs of Infernal.
302 Acceptable formats are: FASTA, GENBANK, and DDBJ. &lt;s&gt; is case-insensitive.
303 --outformat &lt;s&gt; Specify the output alignment format as &lt;s&gt;. Acceptable formats are: Pfam, AFA, A2M,
304 Clustal, and Phylip. AFA is aligned fasta. Only Pfam and Stockholm alignment formats
305 will include consensus structure annotation and posterior probability annotation of aligned
306 residues.
307 --dnaout Output the alignments as DNA sequence alignments, instead of RNA ones.
308 --noprob Do not annotate the output alignment with posterior probabilities.
309 --matchonly Only include match columns in the output alignment, do not include any insertions relative
310 to the consensus model. This option may be useful when creating very large alignments
311 that require a lot of memory and disk space, most of which is necessary only to deal with
312 insert columns that are gaps in most sequences.
313 --ileaved Output the alignment in interleaved Stockholm format of a fixed width that may be more con-
314 venient for examination. This was the default output alignment format of previous versions
315 of cmalign. Note that cmalign requires more memory when this option is used. For this
316 reason, --ileaved will only work for alignments of up to 100,000 sequences or a total of
317 100,000,000 aligned nucleotides.
318 --regress &lt;s&gt; Save an additional copy of the output alignment with no author information to file &lt;s&gt;.
319 --verbose Output additional information in the tabular scores output (output to stdout if -o is used, or
320 to &lt;f&gt; if --sfile &lt;f&gt; is used). These are mainly useful for testing and debugging.
321 --cpu &lt;n&gt; Specify that &lt;n&gt; parallel CPU workers be used. If &lt;n&gt; is set as ”0”, then the program will
322 be run in serial mode, without using threads. You can also control this number by setting an
323 environment variable, INFERNAL_NCPU. This option will only be available if the machine on
324 which Infernal was built is capable of using POSIX threading (see the Installation section of
325 the user guide for more information).
326 --mpi Run as an MPI parallel program. This option will only be available if Infernal has been
327 configured and built with the ”--enable-mpi” flag (see the Installation section of the user
328 guide for more information).
336 Output format
337 -------------
339 (1) target name: The name of the target sequence or profile.
340 (2) accession: The accession of the target sequence or profile, or ’-’ if none.
341 (3) query name: The name of the query sequence or profile.
342 (4) accession: The accession of the query sequence or profile, or ’-’ if none.
343 (5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option is used, in which case a HMM will be used.
344 (6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
345 (7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
346 (8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
347 (9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
348 (10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson) strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less than or equal to the “seq to” value, else it will be greater than or equal to it.
349 (11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit that is not predicted to be truncated by the end of the sequence, “5’” or “3’” if the hit is predicted to have one or more 5’ or 3’ residues missing due to an artificial truncation of the sequence, or “5’&amp;3’” if the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If the hit is an HMM hit, this will always be ’-’.
350 (12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on successive passes.
351 (13) gc: Fraction of G and C nucleotides in the hit.
352 (14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive. It is difficult to correct for all possible ways in which a nonrandom but nonhomologous biological sequences can appear to be similar, such as short-period tandem repeats, so there are cases where the bias correction is not strong enough (creating false positives).
353 (15) score: The score (in bits) for this target/query comparison. It includes the biased-composition correction (the “null3” model for CM hits, or the “null2” model for HMM hits).
354 (16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value; i.e. calculated as the expected number of false positives achieving this comparison’s score for a single query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied by the number of models in the target database. If you search with multiple queries and if you want to control the overall false positive rate of that search rather than the false positive rate per query, you will want to multiply this per-query E-value by how many queries you’re doing.
355 (17) inc: Indicates whether or not this hit achieves the inclusion threshold: ’!’ if it does, ’?’ if it does not (and rather only achieves the reporting threshold). By default, the inclusion threshold is an E-value of 0.01 and the reporting threshold is an E-value of 10.0, but these can be changed with command line options as described in the manual pages.
356 (18) description of target: The remainder of the line is the target’s description line, as free text.
359 For further questions please refer to the Infernal Userguide_.
361 .. _Userguide: http://eddylab.org/infernal/Userguide.pdf
364 How do I cite Infernal?
365 -----------------------
367 The recommended citation for using Infernal 1.1 is E. P. Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches , Bioinformatics 29:2933-2935 (2013).
369 **Galaxy Wrapper Author**::
371 * Bjoern Gruening, University of Freiburg
373 </help>
374 </tool>