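<?xml version="1.0" encoding="UTF-8"?>
<!--
  Repository view metadata captured together with this file:
    view cmalign.xml @ 1:55bb96edfc07 draft
    Uploaded
    author bgruening
    date Thu, 24 Apr 2014 15:02:05 -0400
    parents
    children fac157e22e1b
    line wrap: on
    line source
-->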

<tool id="infernal_cmsearch" name="Search covariance model(s)" version="1.1.0.2">
    <description>against a sequence database (cmsearch)</description>
    <!-- Job splitting: the seqdb input is split (mode to_size, size 100) and the
         two listed outputs are merged again afterwards. -->
    <parallelism method="multi"
                 split_inputs="seqdb"
                 split_mode="to_size"
                 split_size="100"
                 shared_inputs=""
                 merge_outputs="outfile,multiple_alignment_output"/>
    <!-- External packages needed by the command template. Infernal is declared
         once, pinned to the version this wrapper targets; the former unpinned
         duplicate entry was redundant with the pinned one. -->
    <requirements>
        <requirement type="package" version="1.1">infernal</requirement>
        <requirement type="package" version="8.21">gnu_coreutils</requirement>
    </requirements>
    <command>
        ## cmsearch writes its hit table (--tblout) as space-aligned text, which is not
        ## useful as a tabular dataset in Galaxy. The table is therefore written to a
        ## temporary file first and converted to a tab-delimited file afterwards.
        temp_tabular_output=\$(mktemp);

        cmsearch
            ## Infernal Options
            --cpu "\${GALAXY_SLOTS:-12}"
            -o /dev/null
            --tformat $seqdb.ext ##target format: fasta, embl, genbank, ddbj, stockholm, pfam, a2m, afa, clustal, and phylip 
            $bottomonly
            $toponly
            $cyk
            $notrunc
            $max
            $nohmm
            $mid
            ##$bitscore_thresholds
            --tblout \$temp_tabular_output
            $g
            #if $A:
                $A $multiple_alignment_output
            #end if

            #if $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incE":
                --incE $inclusion_thresholds_opts.incE
            #elif $inclusion_thresholds_opts.inclusion_thresholds_selector == "--incT":
                --incT $inclusion_thresholds_opts.incT
            #end if

            #if $reporting_thresholds_opts.reporting_thresholds_selector == "-E":
                -E $reporting_thresholds_opts.E
            #elif $reporting_thresholds_opts.reporting_thresholds_selector == "-T":
                -T $reporting_thresholds_opts.T
            #end if

            ## CM file from the history or stored as database on disc

            #if $cm_opts.cm_opts_selector == "db":
                $cm_opts.database.fields.path
            #else:
                $cm_opts.cmfile
            #end if

            ## sequence file
            $seqdb
            2>&#38;1
            ;

            ## Convert the hit table to tab-delimited text:
            ## 1. delete comment lines (lines that start with a hash sign) and empty lines;
            ##    only whole lines are removed, so a hash inside a description is kept
            ## 2. turn every run of spaces into one tab (the columns are space-padded, so
            ##    replacing single spaces would create empty columns)
            ## 3. turn the 18th and all later tabs of a line back into spaces, because
            ##    field 18 is a free text field that can contain spaces
            sed -e '/^#/d' -e '/^$/d' -e 's/ \+/\t/g' -e 's/\t/ /18g' \$temp_tabular_output > $outfile
            ;

            ## Remove the temporary file, then restore the exit status of sed so that a
            ## failed conversion is not hidden behind a successful rm.
            sed_status=\$?;
            rm -f \$temp_tabular_output;
            (exit \$sed_status)

    </command>
        <inputs>

            <!-- Target sequences to search; the command template passes this dataset's
                 extension to cmsearch as the target format. -->
            <param name="seqdb" label="Sequence database" type="data" format="fasta"/>
            <!-- Source of the query covariance model(s): either an entry listed in
                 infernal.loc (its path column is used on the command line) or a
                 dataset chosen from the history. -->
            <conditional name="cm_opts">
                <param name="cm_opts_selector" label="Subject covariance models" type="select">
                    <option selected="True" value="db">Locally installed covariance models</option>
                    <option value="histdb">Covariance model from your history</option>
                </param>
                <when value="db">
                    <param name="database" label="Covariance models" type="select">
                        <options from_file="infernal.loc">
                            <column index="0" name="value"/>
                            <column index="1" name="name"/>
                            <column index="2" name="path"/>
                        </options>
                    </param>
                </when>
                <when value="histdb">
                    <param name="cmfile" label="Covariance models file from the history." type="data" format="txt"/>
                </when>
            </conditional>

            <!-- Checkbox that adds -g to the cmsearch call when ticked. -->
            <param name="g" type="boolean" checked="False" truevalue="-g" falsevalue=""
                label="Turn on the glocal alignment algorithm"
                help="... global with respect to the query model and local with respect to the target database."/>

            <!-- Strand restriction checkboxes.
                 NOTE(review): both boxes can be ticked at the same time; confirm how
                 cmsearch treats that combination. -->
            <param name="bottomonly" type="boolean" checked="False" truevalue="--bottomonly" falsevalue=""
                label="Only search the bottom (Crick) strand of target sequences"
                help="in the sequence database"/>
            <param name="toponly" type="boolean" checked="False" truevalue="--toponly" falsevalue=""
                label="Only search the top (Watson) strand of target sequences"
                help="in the sequence database"/>

            <!-- Checkbox that switches final hit scoring from Inside to CYK. -->
            <param name="cyk" type="boolean" checked="False" truevalue="--cyk" falsevalue=""
                label="Use the CYK algorithm, not Inside, to determine the final score of all hits"
                help=""/>
            <!-- Checkbox for aligning hits with CYK instead of optimal accuracy.
                 The parameter used to be named with two leading dashes, which cannot
                 be referenced from the Cheetah command template, and its truevalue
                 duplicated the flag of the "cyk" checkbox above; both are corrected.
                 NOTE(review): the command template does not reference $acyk yet, so
                 ticking this box still has no effect until it is added there. -->
            <param name="acyk" truevalue="--acyk" falsevalue="" checked="False" type="boolean" 
                label="Use the CYK algorithm to align hits" help="By default, the Durbin/Holmes optimal accuracy algorithm is used, which finds the alignment that maximizes the expected accuracy of all aligned residues."/>

            <!-- Checkbox that disables detection of truncated hits. -->
            <param name="notrunc" type="boolean" checked="False" truevalue="--notrunc" falsevalue=""
                label="Turn off truncated hit detection"
                help=""/>

            <!-- Acceleration pipeline: checkboxes that switch off filter stages. -->

            <param name="max" type="boolean" checked="False" truevalue="--max" falsevalue=""
                label="Turn off all filters, and run non-banded Inside on every full-length target sequence"
                help="This
                increases sensitivity somewhat, at an extremely large cost in speed."/>

            <param name="nohmm" type="boolean" checked="False" truevalue="--nohmm" falsevalue=""
                label="Turn off all HMM filter stages "
                help=""/>

            <param name="mid" type="boolean" checked="False" truevalue="--mid" falsevalue=""
                label="Turn off the HMM SSV and Viterbi filter stages"
                help=""/>


            <!-- Options for model-specific score thresholding -->
            <!--
            <param name="bitscore_thresholds" type="select" label="Bit score thresholds" help="Curated CM databases may define specific bit score thresholds for each CM, superseding any thresholding based on statistical significance alone.">
                <option value="" selected="true">None</option>
                <option value=" - -cut_ga">GA (gathering) bit scores</option>
                <option value=" - -cut_nc">NC (noise cutoff) bit score</option>
                <option value=" - -cut_tc">TC (trusted cutoff) bit score</option>
            </param>
            -->
            <!-- Options for inclusion thresholds -->


            <!-- Optional inclusion threshold. The selector value is the cmsearch flag
                 itself; the empty value keeps the cmsearch default and adds nothing
                 to the command line. The empty case carries an explicit value=""
                 so that it matches the default option of the selector. -->
            <conditional name="inclusion_thresholds_opts">
                <param name="inclusion_thresholds_selector" type="select" label="Inclusion thresholds"
                help="Inclusion thresholds are stricter than reporting thresholds. Inclusion thresholds control which hits are considered to be reliable enough to be included in an output alignment or in a possible subsequent search round, or marked as significant (”!”) as opposed to questionable (”?”) in hit output.">
                    <option value="" selected="true">default</option>
                    <option value="--incE">Use E-value</option>
                    <option value="--incT">Use bit score</option>
                </param>
                <when value=""/>
                <when value="--incE">
                    <param name="incE" type="float" value="0.01" size="5" label="Use E-value" help="of &lt;= X as the hit inclusion threshold.">
                        <sanitizer>
                            <valid initial="string.printable">
                                <remove value="&apos;"/>
                            </valid>
                        </sanitizer>
                    </param>
                </when>
                <when value="--incT">
                    <param name="incT" type="integer" size="5" value="0" label="Use bit score" help="of >= X as the hit inclusion threshold.">
                        <sanitizer>
                            <valid initial="string.printable">
                                <remove value="&apos;"/>
                            </valid>
                        </sanitizer>
                    </param>
                </when>
            </conditional>

            <!-- Options controlling reporting thresholds -->

            <!-- Optional reporting threshold. The selector value is the cmsearch flag
                 itself; the empty value keeps the cmsearch default and adds nothing
                 to the command line. The empty case carries an explicit value=""
                 so that it matches the default option of the selector. -->
            <conditional name="reporting_thresholds_opts">
                <param name="reporting_thresholds_selector" type="select" label="reporting thresholds"
                help="Reporting thresholds control which hits are reported in output files">
                    <option value="" selected="true">default</option>
                    <option value="-E">Use E-value</option>
                    <option value="-T">Use bit score</option>
                </param>
                <when value=""/>
                <when value="-E">
                    <param name="E" type="float" value="10.0" size="5" label="Use E-value" help="of &lt;= X as the hit reporting threshold. The default is 10.0, meaning that on average, about 10 false positives will be reported per query, so you can see the top of the noise and decide for yourself if it’s really noise.">
                        <sanitizer>
                            <valid initial="string.printable">
                                <remove value="&apos;"/>
                            </valid>
                        </sanitizer>
                    </param>
                </when>
                <when value="-T">
                    <param name="T" type="integer" size="5" value="0" label="Use bit score" help="of >= X as the hit reporting threshold.">
                        <sanitizer>
                            <valid initial="string.printable">
                                <remove value="&apos;"/>
                            </valid>
                        </sanitizer>
                    </param>
                </when>
            </conditional>

            <!-- When ticked, the command template adds -A followed by the path of the
                 multiple_alignment_output dataset. -->
            <param name="A" type="boolean" checked="False" truevalue="-A" falsevalue=""
                label="Save a multiple alignment of all significant hits"
                help="... those satisfying inclusion thresholds"/>

        </inputs>
    <outputs>

        <!-- Tab-delimited hit table produced from the cmsearch tblout report. -->
        <data format="tabular" name="outfile" label="cmsearch on ${on_string}"/>
        <!-- Alignment written by the -A option; only created when the "A" checkbox
             is ticked. cmsearch writes this file as a Stockholm alignment, not as
             a table, so it is typed accordingly. -->
        <data format="stockholm" name="multiple_alignment_output" label="cmsearch on ${on_string} (multi alignment)">
            <filter>A is True</filter>
        </data>

    </outputs>
    <!-- The help body is reStructuredText wrapped in CDATA, because it contains
         literal angle brackets and ampersands (placeholders such as <seqfile>)
         that are not legal in plain XML text.
         NOTE(review): the "What it does" text and the option lists below describe
         cmalign, while the command of this tool runs cmsearch; confirm and replace
         them with the cmsearch description. -->
    <help><![CDATA[


**What it does**

cmalign aligns the RNA sequences in <seqfile> to the covariance model (CM) in <cmfile>. The new alignment is
output to stdout in Stockholm format, but can be redirected to a file <f> with the -o <f> option.
Either <cmfile> or <seqfile> (but not both) may be ’-’ (dash), which means reading this input from stdin rather than a
file.
The sequence file <seqfile> must be in FASTA or Genbank format.
cmalign uses an HMM banding technique to accelerate alignment by default as described below for the --hbanded
option. HMM banding can be turned off with the --nonbanded option.
By default, cmalign computes the alignment with maximum expected accuracy that is consistent with constraints
(bands) derived from an HMM, using a banded version of the Durbin/Holmes optimal accuracy algorithm. This be-
havior can be changed with the --cyk or --sample options.
cmalign takes special care to correctly align truncated sequences, where some nucleotides from the beginning (5’)
and/or end (3’) of the actual full length biological sequence are not present in the input sequence (see DL Kolbe and
SR Eddy, Bioinformatics, 25:1236-1243, 2009). This behavior is on by default, but can be turned off with --notrunc. In
previous versions of cmalign the --sub option was required to appropriately handle truncated sequences. The --sub
option is still available in this version, but the new default method for handling truncated sequences should be as good
or superior to the sub method in nearly all cases.
The --mapali <s> option allows inclusion of the fixed training alignment used to build the CM from file <s> within the
output alignment of cmalign.
It is possible to merge two or more alignments created by the same CM using the Easel miniapp esl-alimerge (included
in the easel/miniapps/ subdirectory of Infernal). Previous versions of cmalign included options to merge alignments
but they were deprecated upon development of esl-alimerge, which is significantly more memory efficient.
By default, cmalign will output the alignment to stdout. The alignment can be redirected to an output file <f> with the
-o <f> option. With -o, information on each aligned sequence, including score and model alignment boundaries will be
printed to stdout (more on this below).
The output alignment will be in Stockholm format by default. This can be changed to Pfam, aligned FASTA (AFA), A2M,
Clustal, or Phylip format using the --outformat <s> option, where <s> is the name of the desired format. As a special
case, if the output alignment is large (more than 10,000 sequences or more than 10,000,000 total nucleotides) than the
output format will be Pfam format, with each sequence appearing on a single line, for reasons of memory efficiency. For
alignments larger than this, using --ileaved will force interleaved Stockholm format, but the user should be aware that
this may require a lot of memory. --ileaved will only work for alignments up to 100,000 sequences or 100,000,000 total
nucleotides.
If the output alignment format is Stockholm or Pfam, the output alignment will be annotated with posterior probabilities
which estimate the confidence level of each aligned nucleotide. This annotation appears as lines beginning with ”#=GR
<seq name> PP”, one per sequence, each immediately below the corresponding aligned sequence ”<seq name>”.
Characters in PP lines have 12 possible values: ”0-9”, ”*”, or ”.”. If ”.”, the position corresponds to a gap in the sequence.
A value of ”0” indicates a posterior probability of between 0.0 and 0.05, ”1” indicates between 0.05 and 0.15, ”2”
indicates between 0.15 and 0.25 and so on up to ”9” which indicates between 0.85 and 0.95. A value of ”*” indicates
a posterior probability of between 0.95 and 1.0. Higher posterior probabilities correspond to greater confidence that
the aligned nucleotide belongs where it appears in the alignment. With --nonbanded, the calculation of the posterior
probabilities considers all possible alignments of the target sequence to the CM. Without --nonbanded (i.e. in default
mode), the calculation considers only possible alignments within the HMM bands. Further, the posterior probabilities
are conditional on the truncation mode of the alignment. For example, if the sequence alignment is truncated 5’, a PP
value of ”9” indicates between 0.85 and 0.95 of all 5’ truncated alignments include the given nucleotide at the given
position. The posterior annotation can be turned off with the --noprob option. If --small is enabled, posterior annotation
must also be turned off using --noprob.
The tabular output that is printed to stdout if the -o option is used includes one line per sequence and twelve fields
per line: ”idx”: the index of the sequence in the input file, ”seq name”: the sequence name; ”length”: the length of the
sequence; ”cm from” and ”cm to”: the model start and end positions of the alignment; ”trunc”: ”no” if the sequence is
not truncated, ”5’” if the beginning of the sequence truncated 5’, ”3’” if the end of the sequence is truncated, and ”5’&3’”
if both the beginning and the end are truncated; ”bit sc”: the bit score of the alignment, ”avg pp” the average posterior
probability of all aligned nucleotides in the alignment; ”band calc”, ”alignment” and ”total”: the time in seconds required
for calculating HMM bands, computing the alignment, and complete processing of the sequence, respectively; ”mem
(Mb)”: the size in Mb of all dynamic programming matrices required for aligning the sequence. This tabular data can be
saved to file <f> with the --sfile <f> option.


Options for controlling the alignment algorithm
--optacc Align sequences using the Durbin/Holmes optimal accuracy algorithm. This is the default.
The optimal accuracy alignment will be constrained by HMM bands for acceleration unless
the --nonbanded option is enabled. The optimal accuracy algorithm determines the align-
ment that maximizes the posterior probabilities of the aligned nucleotides within it. The
posterior probabilities are determined using (possibly HMM banded) variants of the Inside
and Outside algorithms.
--cyk Do not use the Durbin/Holmes optimal accuracy alignment to align the sequences, instead
use the CYK algorithm which determines the optimally scoring (maximum likelihood) align-
ment of the sequence to the model, given the HMM bands (unless --nonbanded is also
enabled).
--sample Sample an alignment from the posterior distribution of alignments. The posterior distribution
is determined using an HMM banded (unless --nonbanded) variant of the Inside algorithm.
--seed <n> Seed the random number generator with <n>, an integer >= 0. This option can only be
used in combination with --sample. If <n> is nonzero, stochastic sampling of alignments
will be reproducible; the same command will give the same results. If <n> is 0, the random
number generator is seeded arbitrarily, and stochastic samplings may vary from run to run
of the same command. The default seed is 181.
--notrunc Turn off truncated alignment algorithms. All sequences in the input file will be assumed to be
full length, unless --sub is also used, in which case the program can still handle truncated
sequences but will use an alternative strategy for their alignment.
--sub Turn on the sub model construction and alignment procedure. For each sequence, an HMM
is first used to predict the model start and end consensus columns, and a new sub CM is
constructed that only models consensus columns from start to end. The sequence is then
aligned to this sub CM. Sub alignment is an older method than the default one for aligning
sequences that are possibly truncated. By default, cmalign uses special DP algorithms to
handle truncated sequences which should be more accurate than the sub method in most
cases. --sub is still included as an option mainly for testing against this default truncated
sequence handling. This ”sub CM” procedure is not the same as the ”sub CMs” described
by Weinberg and Ruzzo.


Other options
--mapali <f> Reads the alignment from file <f> used to build the model aligns it as a single object to
the CM; e.g. the alignment in <f> is held fixed. This allows you to align sequences to a
model with cmalign and view them in the context of an existing trusted multiple alignment.
<f> must be the alignment file that the CM was built from. The program verifies that the
checksum of the file matches that of the file used to construct the CM. A similar option to
this one was called --withali in previous versions of cmalign.
--mapstr Must be used in combination with --mapali <f>. Propagate structural information for any
pseudoknots that exist in <f> to the output alignment. A similar option to this one was called
--withstr in previous versions of cmalign.
--informat <s> Assert that the input <seqfile> is in format <s>. Do not run Babelfish format autodec-
tion. This increases the reliability of the program somewhat, because the Babelfish can
make mistakes; particularly recommended for unattended, high-throughput runs of Infernal.
Acceptable formats are: FASTA, GENBANK, and DDBJ. <s> is case-insensitive.
--outformat <s> Specify the output alignment format as <s>. Acceptable formats are: Pfam, AFA, A2M,
Clustal, and Phylip. AFA is aligned fasta. Only Pfam and Stockholm alignment formats
will include consensus structure annotation and posterior probability annotation of aligned
residues.
--dnaout Output the alignments as DNA sequence alignments, instead of RNA ones.
--noprob Do not annotate the output alignment with posterior probabilities.
--matchonly Only include match columns in the output alignment, do not include any insertions relative
to the consensus model. This option may be useful when creating very large alignments
that require a lot of memory and disk space, most of which is necessary only to deal with
insert columns that are gaps in most sequences.
--ileaved Output the alignment in interleaved Stockholm format of a fixed width that may be more con-
venient for examination. This was the default output alignment format of previous versions
of cmalign. Note that cmalign requires more memory when this option is used. For this
reason, --ileaved will only work for alignments of up to 100,000 sequences or a total of
100,000,000 aligned nucleotides.
--regress <s> Save an additional copy of the output alignment with no author information to file <s>.
--verbose Output additional information in the tabular scores output (output to stdout if -o is used, or
to <f> if --sfile <f> is used). These are mainly useful for testing and debugging.
--cpu <n> Specify that <n> parallel CPU workers be used. If <n> is set as ”0”, then the program will
be run in serial mode, without using threads. You can also control this number by setting an
environment variable, INFERNAL NCPU. This option will only be available if the machine on
which Infernal was built is capable of using POSIX threading (see the Installation section of
the user guide for more information).
--mpi Run as an MPI parallel program. This option will only be available if Infernal has been
configured and built with the ”--enable-mpi” flag (see the Installation section of the user
guide for more information).







Output format
-------------

(1) target name: The name of the target sequence or profile.
(2) accession: The accession of the target sequence or profile, or ’-’ if none.
(3) query name: The name of the query sequence or profile.
(4) accession: The accession of the query sequence or profile, or ’-’ if none.
(5) mdl (model): Which type of model was used to compute the final score. Either ’cm’ or ’hmm’. A CM is used to compute the final hit scores unless the model has zero basepairs or the --hmmonly option is used, in which case a HMM will be used.
(6) mdl from (model coord): The start of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
(7) mdl to (model coord): The end of the alignment of this hit with respect to the profile (CM or HMM), numbered 1..N for a profile of N consensus positions.
(8) seq from (ali coord): The start of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
(9) seq to (ali coord): The end of the alignment of this hit with respect to the sequence, numbered 1..L for a sequence of L residues.
(10) strand: The strand on which the hit occurs on the sequence. ’+’ if the hit is on the top (Watson) strand, ’-’ if the hit is on the bottom (Crick) strand. If on the top strand, the “seq from” value will be less than or equal to the “seq to” value, else it will be greater than or equal to it. 
(11) trunc: Indicates if this is predicted to be a truncated CM hit or not. This will be “no” if it is a CM hit that is not predicted to be truncated by the end of the sequence, “5’ ” or “3’ ” if the hit is predicted to have one or more 5’ or 3’ residues missing due to a artificial truncation of the sequence, or “5’&3’” if the hit is predicted to have one or more 5’ residues missing and one or more 3’ residues missing. If the hit is an HMM hit, this will always be ’-’.
(12) pass: Indicates what “pass” of the pipeline the hit was detected on. This is probably only useful for testing and debugging. Non-truncated hits are found on the first pass, truncated hits are found on successive passes.
(13) gc: Fraction of G and C nucleotides in the hit.
(14) bias: The biased-composition correction: the bit score difference contributed by the null3 model for CM hits, or the null2 model for HMM hits. High bias scores may be a red flag for a false positive. It is difficult to correct for all possible ways in which a nonrandom but nonhomologous biological sequences can appear to be similar, such as short-period tandem repeats, so there are cases where the bias correction is not strong enough (creating false positives).
(15) score: The score (in bits) for this target/query comparison. It includes the biased-composition cor-rection (the “null3” model for CM hits, or the “null2” model for HMM hits).
(16) E-value: The expectation value (statistical significance) of the target. This is a per query E-value; i.e. calculated as the expected number of false positives achieving this comparison’s score for a single query against the search space Z. For cmsearch Z is defined as the total number of nucleotides in the target dataset multiplied by 2 because both strands are searched. For cmscan Z is the total number of nucleotides in the query sequence multiplied by 2 because both strands are searched and multiplied by the number of models in the target database. If you search with multiple queries and if you want to control the overall false positive rate of that search rather than the false positive rate per query, you will want to multiply this per-query E-value by how many queries you’re doing.
(17) inc: Indicates whether or not this hit achieves the inclusion threshold: ’!’ if it does, ’?’ if it does not (and rather only achieves the reporting threshold). By default, the inclusion threshold is an E-value of 0.01 and the reporting threshold is an E-value of 10.0, but these can be changed with command line options as described in the manual pages.
(18) description of target: The remainder of the line is the target’s description line, as free text.


For further questions please refer to the Infernal Userguide_.

.. _Userguide: http://selab.janelia.org/software/infernal/Userguide.pdf


How do I cite Infernal?
-----------------------

The recommended citation for using Infernal 1.1 is E. P. Nawrocki and S. R. Eddy, Infernal 1.1: 100-fold faster RNA homology searches , Bioinformatics 29:2933-2935 (2013). 

**Galaxy Wrapper Author**::

    *  Bjoern Gruening, University of Freiburg

    ]]></help>
</tool>