comparison cmbuild.xml @ 4:c47a7c52ac4f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/infernal commit 236abfe2af8034017994daea08079887e64b68c8
author bgruening
date Mon, 19 Dec 2016 15:27:06 -0500
parents 2c2c5e5e495b
children 6e18e0b098cd
comparison
equal deleted inserted replaced
3:2c2c5e5e495b 4:c47a7c52ac4f
1 <tool id="infernal_cmbuild" name="Build covariance models" version="1.1.0.1"> 1 <tool id="infernal_cmbuild" name="Build covariance models" version="1.1.0.2">
2 <description>from sequence alignments (cmbuild)</description> 2 <description>from sequence alignments (cmbuild)</description>
3 <parallelism method="multi" split_inputs="alignment_infile" split_mode="to_size" split_size="10" shared_inputs="" merge_outputs="cmfile_outfile"></parallelism> 3 <parallelism method="multi" split_inputs="alignment_infile" split_mode="to_size" split_size="10" shared_inputs="" merge_outputs="cmfile_outfile"></parallelism>
4 <requirements> 4 <requirements>
5 <requirement type="package">infernal</requirement> 5 <requirement type="package">infernal</requirement>
6 <requirement type="package" version="1.1">infernal</requirement> 6 <requirement type="package" version="1.1">infernal</requirement>
7 <requirement type="package" version="8.22">gnu_coreutils</requirement> 7 <requirement type="package" version="8.22">gnu_coreutils</requirement>
8 </requirements> 8 </requirements>
9 <command> 9 <command>
10 <![CDATA[ 10 <![CDATA[
11 cmbuild 11 cmbuild -F
12 #if $is_summery_output: 12 #if $is_summery_output:
13 -o $summary_outfile 13 -o '$summary_outfile'
14 #end if 14 #end if
15
16 ## to many outputs, is that one really needed?
17 ##-O $annotated_source_alignment_outfile
18 15
19 $model_construction_opts.model_construction_opts_selector 16 $model_construction_opts.model_construction_opts_selector
20 #if $model_construction_opts.model_construction_opts_selector == '--fast': 17 #if $model_construction_opts.model_construction_opts_selector == '--fast':
21 --symfrac $model_construction_opts.symfrac 18 --symfrac $model_construction_opts.symfrac
22 #end if 19 #end if
52 49
53 $notrunc 50 $notrunc
54 $cyk 51 $cyk
55 #end if 52 #end if
56 53
57 $cmfile_outfile 54 '$cmfile_outfile'
58 $alignment_infile 55 '$alignment_infile'
59 56 &&
57 cmcalibrate
58 -L 0.01 --cpu \${GALAXY_SLOTS:-2}
59 '$cmfile_outfile'
60 ]]> 60 ]]>
61 </command> 61 </command>
62 <inputs> 62 <inputs>
63 <!-- Stockholm or SELEX 63 <!-- Stockholm or SELEX
64 SELEX is defined in EMBOSS datatypes 64 SELEX is defined in EMBOSS datatypes
96 <param name="wid" type="float" value="0.5" 96 <param name="wid" type="float" value="0.5"
97 label="Percent identity for clustering the alignment (--wid)" help=""/> 97 label="Percent identity for clustering the alignment (--wid)" help=""/>
98 </when> 98 </when>
99 </conditional> 99 </conditional>
100 100
101
102 <conditional name="effective_opts"> 101 <conditional name="effective_opts">
103 <param name="effective_opts_selector" type="select" label="Options controlling effective sequence number" help=""> 102 <param name="effective_opts_selector" type="select" label="Options controlling effective sequence number" help="">
104 <option value="--eent" selected="true">entropy weighting strategy (--eent)</option> 103 <option value="--eent" >entropy weighting strategy (--eent)</option>
105 <option value="--enone">Turn off the entropy weighting strategy (--enone)</option> 104 <option value="--enone" selected="true">Turn off the entropy weighting strategy (--enone)</option>
106 </param> 105 </param>
107 <when value="--enone"/> 106 <when value="--enone"/>
108 <when value="--eent"> 107 <when value="--eent">
109 <param name="ere" type="float" value="0.59" 108 <param name="ere" type="float" value="0.59"
110 label="Set the target mean match state relative entropy (--ere)" help=""/> 109 label="Set the target mean match state relative entropy (--ere)" help=""/>
117 116
118 <param name="eset" type="integer" value="" 117 <param name="eset" type="integer" value=""
119 label="Set the effective sequence number for entropy weighting (--eset)" help=""/> 118 label="Set the effective sequence number for entropy weighting (--eset)" help=""/>
120 </when> 119 </when>
121 </conditional> 120 </conditional>
122
123 121
124 <conditional name="refining_opts"> 122 <conditional name="refining_opts">
125 <param name="refining_opts_selector" type="select" label="Options for refining the input alignment" help=""> 123 <param name="refining_opts_selector" type="select" label="Options for refining the input alignment" help="">
126 <option value="" selected="true">No refinement</option> 124 <option value="" selected="true">No refinement</option>
127 <option value="--refine">refine the input alignment</option> 125 <option value="--refine">refine the input alignment</option>
155 label="Output the refined alignment file as it is used to build the covariance model" help=""/> 153 label="Output the refined alignment file as it is used to build the covariance model" help=""/>
156 154
157 </when> 155 </when>
158 </conditional> 156 </conditional>
159 157
160
161 <param name="is_summery_output" truevalue="" falsevalue="" checked="False" type="boolean" 158 <param name="is_summery_output" truevalue="" falsevalue="" checked="False" type="boolean"
162 label="Output a summery file?" help=""/> 159 label="Output a summery file?" help=""/>
163 160
164 </inputs> 161 </inputs>
165 <outputs> 162 <outputs>
166
167 <data format="text" name="summary_outfile" label="cmbuild summary on ${on_string}"> 163 <data format="text" name="summary_outfile" label="cmbuild summary on ${on_string}">
168 <filter>is_summery_output is True</filter> 164 <filter>is_summery_output is True</filter>
169 </data> 165 </data>
170 <!--<data format="stockholm" name="annotated_source_alignment_outfile" label="Annotated alignment from ${on_string}"/>--> 166 <!--<data format="stockholm" name="annotated_source_alignment_outfile" label="Annotated alignment from ${on_string}"/>-->
171 <data format="cm" name="cmfile_outfile" label="Covariance models from ${on_string}"/> 167 <data format="cm" name="cmfile_outfile" label="Covariance models from ${on_string}"/>
181 177
182 </outputs> 178 </outputs>
183 <help> 179 <help>
184 <![CDATA[ 180 <![CDATA[
185 181
186
187 **What it does** 182 **What it does**
188 183
189 cmbuild belongs to the INFERNAL software package that allows you to make consensus RNA secondary structure profiles, and use them to search nucleic acid sequence databases for homologous RNAs, or to create new structure-based multiple sequence alignments. 184 cmbuild belongs to the INFERNAL software package that allows you to make consensus RNA secondary structure profiles, and use them to search nucleic acid sequence databases for homologous RNAs, or to create new structure-based multiple sequence alignments.
190 185
191 cmbuild builds a covariance model of an RNA multiple alignment. cmbuild uses the consensus structure to determine the architecture of the CM. 186 cmbuild builds a covariance model of an RNA multiple alignment. cmbuild uses the consensus structure to determine the architecture of the CM.
192
193 187
194 **Input** 188 **Input**
195 189
196 Input file is a multiple sequence alignment file in Stockholm or SELEX format, and must contain consensus secondary structure annotation. 190 Input file is a multiple sequence alignment file in Stockholm or SELEX format, and must contain consensus secondary structure annotation.
197 cmbuild uses the consensus structure to determine the architecture of the CM. 191 cmbuild uses the consensus structure to determine the architecture of the CM.
198 192
199 Example: simple example of a multiple RNA sequence alignment with secondary structure annotation 193 Example: simple example of a multiple RNA sequence alignment with secondary structure annotation
200 194
201 # STOCKHOLM 1.0 195 # STOCKHOLM 1.0
196
202 tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCC 197 tRNA1 GCGGAUUUAGCUCAGUUGGG.AGAGCGCCAGACUGAAGAUCUGGAGGUCC
198
203 tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CC 199 tRNA2 UCCGAUAUAGUGUAAC.GGCUAUCACAUCACGCUUUCACCGUGGAGA.CC
200
204 tRNA3 UCCGUGAUAGUUUAAU.GGUCAGAAUGGGCGCUUGUCGCGUGCCAGA.UC 201 tRNA3 UCCGUGAUAGUUUAAU.GGUCAGAAUGGGCGCUUGUCGCGUGCCAGA.UC
202
205 tRNA4 GCUCGUAUGGCGCAGU.GGU.AGCGCAGCAGAUUGCAAAUCUGUUGGUCC 203 tRNA4 GCUCGUAUGGCGCAGU.GGU.AGCGCAGCAGAUUGCAAAUCUGUUGGUCC
204
206 tRNA5 GGGCACAUGGCGCAGUUGGU.AGCGCGCUUCCCUUGCAAGGAAGAGGUCA 205 tRNA5 GGGCACAUGGCGCAGUUGGU.AGCGCGCUUCCCUUGCAAGGAAGAGGUCA
206
207 #=GC SS_cons <<<<<<<..<<<<.........>>>>.<<<<<.......>>>>>.....< 207 #=GC SS_cons <<<<<<<..<<<<.........>>>>.<<<<<.......>>>>>.....<
208 208
209 209
210 **Output** 210 **Output**
211 211
212 The output of cmbuild contains information about the size of your input alignment (in aligned columns 212 The output of cmbuild contains information about the size of your input alignment (in aligned columns
213 and # of sequences), and about the size of the resulting model. 213 and # of sequences), and about the size of the resulting model.
214 214
215 In addition to writing CM(s) to the output file, cmbuild also outputs a single line for each model created to stdout. 215 In addition to writing CM(s) to the output file, cmbuild also outputs a single line for each model created to stdout.
216 Each line has the following fields: 216 Each line has the following fields:
217 - aln: the index of the alignment used to build the CM 217 - aln: the index of the alignment used to build the CM
218 - idx: the index of the CM in the output file 218 - idx: the index of the CM in the output file
219 - name: the name of the CM 219 - name: the name of the CM
220 - nseq: the number of sequences in the alignment used to build the CM 220 - nseq: the number of sequences in the alignment used to build the CM
221 - eff nseq: the effective number of sequences used to build the model 221 - eff nseq: the effective number of sequences used to build the model
222 - alen: the length of the alignment used to build the CM 222 - alen: the length of the alignment used to build the CM
228 - description: description of the model/alignment. 228 - description: description of the model/alignment.
229 229
230 230
231 **Options controlling model construction** 231 **Options controlling model construction**
232 232
233
234 These options control how consensus columns are defined in an alignment. 233 These options control how consensus columns are defined in an alignment.
235 234
236 - *--fast*: Define consensus columns automatically as those that have a fraction >= symfrac of residues as opposed to gaps. (See below for the --symfrac option.) This is the default. 235 - *--fast*: Define consensus columns automatically as those that have a fraction >= symfrac of residues as opposed to gaps. (See below for the --symfrac option.) This is the default.
237 - *--hand*: Use reference coordinate annotation (#=GC RF line, in Stockholm) to determine which columns are consensus, and which are inserts. Any non-gap character indicates a consensus column. (For example, mark consensus columns with ”x”, and insert columns with ”.”.) 236 - *--hand*: Use reference coordinate annotation (#=GC RF line, in Stockholm) to determine which columns are consensus, and which are inserts. Any non-gap character indicates a consensus column. (For example, mark consensus columns with ”x”, and insert columns with ”.”.)
238 - *--symfrac*: Define the residue fraction threshold necessary to define a consensus column when not using --hand. The default is 0.5. The symbol fraction in each column is calculated after taking relative sequence weighting into account. Setting this to 0.0 means that every alignment column will be assigned as consensus, which may be useful in some cases. Setting it to 1.0 means that only columns that include 0 gaps will be assigned as consensus. 237 - *--symfrac*: Define the residue fraction threshold necessary to define a consensus column when not using --hand. The default is 0.5. The symbol fraction in each column is calculated after taking relative sequence weighting into account. Setting this to 0.0 means that every alignment column will be assigned as consensus, which may be useful in some cases. Setting it to 1.0 means that only columns that include 0 gaps will be assigned as consensus.
261 - *--ere*: Set the target mean match state relative entropy. By default the target relative entropy per match position is 0.59 bits for models with at least 1 basepair and 0.38 for models with zero basepairs. 260 - *--ere*: Set the target mean match state relative entropy. By default the target relative entropy per match position is 0.59 bits for models with at least 1 basepair and 0.38 for models with zero basepairs.
262 - *--eminseq*: Define the minimum allowed effective sequence number. 261 - *--eminseq*: Define the minimum allowed effective sequence number.
263 - *--ehmmre*: Set the target HMM mean match state relative entropy. Entropy for basepairing match states is calculated using marginalized basepair emission probabilities. 262 - *--ehmmre*: Set the target HMM mean match state relative entropy. Entropy for basepairing match states is calculated using marginalized basepair emission probabilities.
264 - *--eset*: Set the effective sequence number for entropy weighting. 263 - *--eset*: Set the effective sequence number for entropy weighting.
265 264
266
267
268 **Options for refining the input alignment** 265 **Options for refining the input alignment**
269 266
270 - *--refine*: Attempt to refine the alignment before building the CM using expectation-maximization (EM). A CM is first built from the initial alignment as usual. Then, the sequences in the alignment are realigned optimally (with the HMM banded CYK algorithm, optimal means optimal given the bands) to the CM, and a new CM is built from the resulting alignment. The sequences are then realigned to the new CM, and a new CM is built from that alignment. This is continued until convergence, specifically when the alignments for two successive iterations are not significantly different (the summed bit scores of all the sequences in the alignment changes less than 1% between two successive iterations). 267 - *--refine*: Attempt to refine the alignment before building the CM using expectation-maximization (EM). A CM is first built from the initial alignment as usual. Then, the sequences in the alignment are realigned optimally (with the HMM banded CYK algorithm, optimal means optimal given the bands) to the CM, and a new CM is built from the resulting alignment. The sequences are then realigned to the new CM, and a new CM is built from that alignment. This is continued until convergence, specifically when the alignments for two successive iterations are not significantly different (the summed bit scores of all the sequences in the alignment changes less than 1% between two successive iterations).
271 - *Turn on the local alignment algorithm*: allows the alignment to span two or more subsequences if necessary (e.g. if the structures of the query model and target sequence are only partially shared), allowing certain large insertions and deletions in the structure to be penalized differently than normal indels. The default is to globally align the query model to the target sequences. 268 - *Turn on the local alignment algorithm*: allows the alignment to span two or more subsequences if necessary (e.g. if the structures of the query model and target sequence are only partially shared), allowing certain large insertions and deletions in the structure to be penalized differently than normal indels. The default is to globally align the query model to the target sequences.
272 - *--gibbs sampling*: Modifies the behavior of --refine so Gibbs sampling is used instead of EM. The difference is that during the alignment stage the alignment is not necessarily optimal; instead an alignment (parsetree) for each sequence is sampled from the posterior distribution of alignments as determined by the Inside algorithm. Due to this sampling step --gibbs is non-deterministic, so different runs with the same alignment may yield different results. This is not true when --refine is used without the --gibbs option, in which case the final alignment and CM will always be the same. When --gibbs is enabled, the --seed "number" option can be used to seed the random number generator predictably, making the results reproducible. The goal of the --gibbs option is to help expert RNA alignment curators refine structural alignments by allowing them to observe alternative high scoring alignments. 269 - *--gibbs sampling*: Modifies the behavior of --refine so Gibbs sampling is used instead of EM. The difference is that during the alignment stage the alignment is not necessarily optimal; instead an alignment (parsetree) for each sequence is sampled from the posterior distribution of alignments as determined by the Inside algorithm. Due to this sampling step --gibbs is non-deterministic, so different runs with the same alignment may yield different results. This is not true when --refine is used without the --gibbs option, in which case the final alignment and CM will always be the same. When --gibbs is enabled, the --seed "number" option can be used to seed the random number generator predictably, making the results reproducible. The goal of the --gibbs option is to help expert RNA alignment curators refine structural alignments by allowing them to observe alternative high scoring alignments.
273 - *--Random seed*: Seed the random number generator with an integer >= 0. This option can only be used in combination with --gibbs. If the given number is nonzero, stochastic sampling of alignments will be reproducible; the same command will give the same results. If the given number is 0, the random number generator is seeded arbitrarily, and stochastic samplings may vary from run to run of the same command. The default seed is 0. 270 - *--Random seed*: Seed the random number generator with an integer >= 0. This option can only be used in combination with --gibbs. If the given number is nonzero, stochastic sampling of alignments will be reproducible; the same command will give the same results. If the given number is 0, the random number generator is seeded arbitrarily, and stochastic samplings may vary from run to run of the same command. The default seed is 0.
274 - *--Turn off the truncated alignment algorithm*: With --refine, turn off the truncated alignment algorithm. There is more information on this in the cmalign manual page. 271 - *--Turn off the truncated alignment algorithm*: With --refine, turn off the truncated alignment algorithm. There is more information on this in the cmalign manual page.
275 - *--cyk algorithm*: With --refine, align with the CYK algorithm. By default the optimal accuracy algorithm is used. There is more information on this in the cmalign manual page. 272 - *--cyk algorithm*: With --refine, align with the CYK algorithm. By default the optimal accuracy algorithm is used. There is more information on this in the cmalign manual page.
276
277
278 273
279 For further questions please refer to the Infernal Userguide_. 274 For further questions please refer to the Infernal Userguide_.
280 275
281 .. _Userguide: http://selab.janelia.org/software/infernal/Userguide.pdf 276 .. _Userguide: http://selab.janelia.org/software/infernal/Userguide.pdf
282
283 277
284 ]]> 278 ]]>
285 </help> 279 </help>
286
287 <citations> 280 <citations>
288 <citation type="doi">10.1093/bioinformatics/btt509</citation> 281 <citation type="doi">10.1093/bioinformatics/btt509</citation>
289 <citation type="bibtex"> 282 <citation type="bibtex">
290 @ARTICLE{bgruening_galaxytools, 283 @ARTICLE{bgruening_galaxytools,
291 Author = {Björn Grüning, Cameron Smith, Torsten Houwaart, Nicola Soranzo, Eric Rasche}, 284 Author = {Björn Grüning, Cameron Smith, Torsten Houwaart, Nicola Soranzo, Eric Rasche},
293 title = {{Galaxy Tools - A collection of bioinformatics and cheminformatics tools for the Galaxy environment}}, 286 title = {{Galaxy Tools - A collection of bioinformatics and cheminformatics tools for the Galaxy environment}},
294 url = {https://github.com/bgruening/galaxytools} 287 url = {https://github.com/bgruening/galaxytools}
295 } 288 }
296 </citation> 289 </citation>
297 </citations> 290 </citations>
298
299 </tool> 291 </tool>