comparison cmsearch.xml @ 2:fac157e22e1b draft

Uploaded
author bgruening
date Fri, 13 Feb 2015 03:10:51 -0500
parents 55bb96edfc07
children 2c2c5e5e495b
comparison
equal deleted inserted replaced
1:55bb96edfc07 2:fac157e22e1b
1 <tool id="infernal_cmsearch" name="Search covariance model(s)" version="1.1.0.2"> 1 <tool id="infernal_cmsearch" name="Search covariance model(s)" version="1.1.0.2">
2 <description>against a sequence database (cmsearch)</description> 2 <description>against a sequence database (cmsearch)</description>
3 <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="100" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"></parallelism> 3 <parallelism method="multi" split_inputs="seqdb" split_mode="to_size" split_size="500" shared_inputs="" merge_outputs="outfile,multiple_alignment_output"></parallelism>
4 <requirements> 4 <requirements>
5 <requirement type="package">infernal</requirement> 5 <requirement type="package">infernal</requirement>
6 <requirement type="package" version="1.1">infernal</requirement> 6 <requirement type="package" version="1.1">infernal</requirement>
7 <requirement type="package" version="8.21">gnu_coreutils</requirement> 7 <requirement type="package" version="8.22">gnu_coreutils</requirement>
8 </requirements> 8 </requirements>
9 <command> 9 <command>
10 <![CDATA[
10 ## a temp file is needed, because the standard tabular output from infernal is not useful in Galaxy 11 ## a temp file is needed, because the standard tabular output from infernal is not useful in Galaxy
11 ## it will be converted to a tab delimited file and piped to Galaxy 12 ## it will be converted to a tab delimited file and piped to Galaxy
12 temp_tabular_output=\$(mktemp); 13 temp_tabular_output=\$(mktemp);
13 14
14 cmsearch 15 cmsearch
15 ## Infernal Options 16 ## Infernal Options
16 --cpu "\${GALAXY_SLOTS:-12}" 17 --cpu "\${GALAXY_SLOTS:-12}"
17 -o /dev/null 18 -o /dev/null
18 --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip 19 --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip
19 $bottomonly 20 $bottomonly
20 $toponly 21 $toponly
21 $cyk 22 $cyk
22 $notrunc 23 $notrunc
23 $max 24 $max
50 $cm_opts.cmfile 51 $cm_opts.cmfile
51 #end if 52 #end if
52 53
53 ## sequence file 54 ## sequence file
54 $seqdb 55 $seqdb
55 2>&#38;1 56 2>&1
56 ; 57 ;
57 58
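58 ## 1. strip comments (everything from # to the end of the line) and delete the lines left empty 59 ## 1. strip comments (everything from # to the end of the line) and delete the lines left empty
59 ## 2. convert whitespace runs to tabs, then turn the 18th and later tabs back into spaces: the 18th field is a free text field (can contain spaces) 60 ## 2. convert whitespace runs to tabs, then turn the 18th and later tabs back into spaces: the 18th field is a free text field (can contain spaces)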
60 sed -e 's/#.*$//' -e '/^$/d' -e 's/\s\+/\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile 61 sed -e 's/#.*$//' -e '/^$/d' -e 's/\s\+/\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile
61 62
63 ]]>
62 </command> 64 </command>
63 <inputs> 65 <inputs>
64 66
65 <param name="seqdb" type="data" format="fasta" label="Sequence database"/> 67 <param name="seqdb" type="data" format="fasta" label="Sequence database"/>
66 68
81 <when value="histdb"> 83 <when value="histdb">
82 <param name="cmfile" type="data" format="cm" label="Covariance models file from the history."/> 84 <param name="cmfile" type="data" format="cm" label="Covariance models file from the history."/>
83 </when> 85 </when>
84 </conditional> 86 </conditional>
85 87
86 <param name="g" truevalue="-g" falsevalue="" checked="False" type="boolean" 88 <param name="g" truevalue="-g" falsevalue="" checked="False" type="boolean"
87 label="Turn on the glocal alignment algorithm" help="... global with respect to the query model and local with respect to the target database."/> 89 label="Turn on the glocal alignment algorithm" help="... global with respect to the query model and local with respect to the target database."/>
88 90
89 <param name="bottomonly" truevalue="--bottomonly" falsevalue="" checked="False" type="boolean" 91 <param name="bottomonly" truevalue="--bottomonly" falsevalue="" checked="False" type="boolean"
90 label="Only search the bottom (Crick) strand of target sequences" help="in the sequence database"/> 92 label="Only search the bottom (Crick) strand of target sequences" help="in the sequence database"/>
91 <param name="toponly" truevalue="--toponly" falsevalue="" checked="False" type="boolean" 93 <param name="toponly" truevalue="--toponly" falsevalue="" checked="False" type="boolean"
92 label="Only search the top (Watson) strand of target sequences" help="in the sequence database"/> 94 label="Only search the top (Watson) strand of target sequences" help="in the sequence database"/>
93 95
94 <param name="cyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean" 96 <param name="cyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean"
95 label="Use the CYK algorithm, not Inside, to determine the final score of all hits" help=""/> 97 label="Use the CYK algorithm, not Inside, to determine the final score of all hits" help=""/>
96 <param name="--acyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean" 98 <param name="--acyk" truevalue="--cyk" falsevalue="" checked="False" type="boolean"
97 label="Use the CYK algorithm to align hits" help="By default, the Durbin/Holmes optimal accuracy algorithm is used, which finds the alignment that maximizes the expected accuracy of all aligned residues."/> 99 label="Use the CYK algorithm to align hits" help="By default, the Durbin/Holmes optimal accuracy algorithm is used, which finds the alignment that maximizes the expected accuracy of all aligned residues."/>
98 100
99 <param name="notrunc" truevalue="--notrunc" falsevalue="" checked="False" type="boolean" 101 <param name="notrunc" truevalue="--notrunc" falsevalue="" checked="False" type="boolean"
100 label="Turn off truncated hit detection" help=""/> 102 label="Turn off truncated hit detection" help=""/>
101 103
102 <!-- acceleration pipeline --> 104 <!-- acceleration pipeline -->
103 105
104 <param name="max" truevalue="--max" falsevalue="" checked="False" type="boolean" 106 <param name="max" truevalue="--max" falsevalue="" checked="False" type="boolean"
105 label="Turn off all filters, and run non-banded Inside on every full-length target sequence" help="This 107 label="Turn off all filters, and run non-banded Inside on every full-length target sequence" help="This
106 increases sensitivity somewhat, at an extremely large cost in speed."/> 108 increases sensitivity somewhat, at an extremely large cost in speed."/>
107 109
108 <param name="nohmm" truevalue="--nohmm" falsevalue="" checked="False" type="boolean" 110 <param name="nohmm" truevalue="--nohmm" falsevalue="" checked="False" type="boolean"
109 label="Turn off all HMM filter stages " help=""/> 111 label="Turn off all HMM filter stages " help=""/>
110 112
111 <param name="mid" truevalue="--mid" falsevalue="" checked="False" type="boolean" 113 <param name="mid" truevalue="--mid" falsevalue="" checked="False" type="boolean"
112 label="Turn off the HMM SSV and Viterbi filter stages" help=""/> 114 label="Turn off the HMM SSV and Viterbi filter stages" help=""/>
113 115
114 116
115 <!-- Options for model-specific score thresholding --> 117 <!-- Options for model-specific score thresholding -->
116 <!-- 118 <!--
180 </sanitizer> 182 </sanitizer>
181 </param> 183 </param>
182 </when> 184 </when>
183 </conditional> 185 </conditional>
184 186
185 <param name="A" truevalue="-A" falsevalue="" checked="False" type="boolean" 187 <param name="A" truevalue="-A" falsevalue="" checked="False" type="boolean"
186 label="Save a multiple alignment of all significant hits" help="... those satisfying inclusion thresholds"/> 188 label="Save a multiple alignment of all significant hits" help="... those satisfying inclusion thresholds"/>
187 189
188 </inputs> 190 </inputs>
189 <outputs> 191 <outputs>
190 192
193 <filter>A is True</filter> 195 <filter>A is True</filter>
194 </data> 196 </data>
195 197
196 </outputs> 198 </outputs>
197 <help> 199 <help>
200 <![CDATA[
198 201
199 202
200 **What it does** 203 **What it does**
201 204
202 Infernal is used to search sequence databases for homologs of structural RNA sequences, and to make 205 Infernal is used to search sequence databases for homologs of structural RNA sequences, and to make
203 sequence- and structure-based RNA sequence alignments. Infernal needs a profile from a structurally 206 sequence- and structure-based RNA sequence alignments. Infernal needs a profile from a structurally
204 annotated multiple sequence alignment of an RNA family with a position-specific scoring system for substitutions, 207 annotated multiple sequence alignment of an RNA family with a position-specific scoring system for substitutions,
205 insertions, and deletions. Positions in the profile that are basepaired in the consensus secondary 208 insertions, and deletions. Positions in the profile that are basepaired in the consensus secondary
206 structure of the alignment are modeled as dependent on one another, allowing Infernal’s scoring system to 209 structure of the alignment are modeled as dependent on one another, allowing Infernal’s scoring system to
207 consider the secondary structure, in addition to the primary sequence, of the family being modeled. Infernal 210 consider the secondary structure, in addition to the primary sequence, of the family being modeled. Infernal
208 profiles are probabilistic models called “covariance models”, a specialized type of stochastic context-free 211 profiles are probabilistic models called “covariance models”, a specialized type of stochastic context-free
209 grammar (SCFG) (Lari and Young, 1990). 212 grammar (SCFG) (Lari and Young, 1990).
210 213
211 Compared to other alignment and database search tools based only on sequence comparison, Infernal 214 Compared to other alignment and database search tools based only on sequence comparison, Infernal
212 aims to be significantly more accurate and more able to detect remote homologs because it models sequence 215 aims to be significantly more accurate and more able to detect remote homologs because it models sequence
213 and structure. 216 and structure.
214 217
215 218
216 Output format 219 Output format
217 ------------- 220 -------------
223 (5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option is used, in which case a HMM will be used. 226 (5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option is used, in which case a HMM will be used.
224 (6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions. 227 (6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
225 (7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions. 228 (7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
226 (8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues. 229 (8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
227 (9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues. 230 (9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
228 (10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson) strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less than or equal to the “seq to” value, else it will be greater than or equal to it. 231 (10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson) strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less than or equal to the “seq to” value, else it will be greater than or equal to it.
229 (11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit that is not predicted to be truncated by the end of the sequence, “5’ ” or “3’ ” if the hit is predicted to have one or more 5’ or 3’ residues missing due to an artificial truncation of the sequence, or “5’&amp;3’” if the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If the hit is an HMM hit, this will always be ’-’. 232 (11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit that is not predicted to be truncated by the end of the sequence, “5’ ” or “3’ ” if the hit is predicted to have one or more 5’ or 3’ residues missing due to an artificial truncation of the sequence, or “5’&3’” if the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If the hit is an HMM hit, this will always be ’-’.
230 (12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on successive passes. 233 (12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on successive passes.
231 (13) gc: Fraction of G and C nucleotides in the hit. 234 (13) gc: Fraction of G and C nucleotides in the hit.
232 (14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive. It is difficult to correct for all possible ways in which nonrandom but nonhomologous biological sequences can appear to be similar, such as short-period tandem repeats, so there are cases where the bias correction is not strong enough (creating false positives). 235 (14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive. It is difficult to correct for all possible ways in which nonrandom but nonhomologous biological sequences can appear to be similar, such as short-period tandem repeats, so there are cases where the bias correction is not strong enough (creating false positives).
233 (15) score: The score (in bits) for this target/query comparison. It includes the biased-composition correction (the “null3” model for CM hits, or the “null2” model for HMM hits). 236 (15) score: The score (in bits) for this target/query comparison. It includes the biased-composition correction (the “null3” model for CM hits, or the “null2” model for HMM hits).
234 (16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value; i.e. calculated as the expected number of false positives achieving this comparison’s score for a single query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied by the number of models in the target database. If you search with multiple queries and if you want to control the overall false positive rate of that search rather than the false positive rate per query, you will want to multiply this per-query E-value by how many queries you’re doing. 237 (16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value; i.e. calculated as the expected number of false positives achieving this comparison’s score for a single query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied by the number of models in the target database. If you search with multiple queries and if you want to control the overall false positive rate of that search rather than the false positive rate per query, you will want to multiply this per-query E-value by how many queries you’re doing.
242 245
243 246
244 How do I cite Infernal? 247 How do I cite Infernal?
245 ----------------------- 248 -----------------------
246 249
247 The recommended citation for using Infernal 1.1 is E. P. Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches, Bioinformatics 29:2933-2935 (2013). 250 The recommended citation for using Infernal 1.1 is E. P. Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches, Bioinformatics 29:2933-2935 (2013).
248 251
249 **Galaxy Wrapper Author**:: 252 **Galaxy Wrapper Author**::
250 253
251 * Bjoern Gruening, University of Freiburg 254 * Bjoern Gruening, University of Freiburg
252 255
256 ]]>
253 </help> 257 </help>
254 </tool> 258 </tool>