view salmon.xml @ 0:91f3a2147127 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/salmon commit d7607abc9feea09f0f8a227ead4da09323e167bb
author bgruening
date Tue, 15 Nov 2016 11:36:41 -0500
parents
children c1d822f84e1a
line wrap: on
line source

<tool id="salmon" name="Salmon" version="0.7.2">

    <description>Transcript Quantification from RNA-seq data</description>

    <macros>
        <!-- Shared strandedness selector. It is expanded in both the single-end and the
             paired-end branch of the single_or_paired conditional, and its value is used
             by the command template to build the library type string given to salmon. -->
        <xml name="strandedness">
            <param name="strandedness"
                   type="select"
                   label="Specify the strandedness of the reads">
                <option value="U" selected="True">Not stranded (U)</option>
                <option value="SF">read 1 (or single-end read) comes from the forward strand (SF)</option>
                <option value="SR">read 1 (or single-end read) comes from the reverse strand (SR)</option>
            </param>
        </xml>
    </macros>

    <!-- Runtime dependency: the salmon package, pinned to 0.7.2 (the same number as the
         version attribute of the tool element). -->
    <requirements>
        <requirement type="package" version="0.7.2">salmon</requirement>
    </requirements>

    <!-- Job failure detection: any non-zero exit code, or an error/exception marker
         in the tool output, marks the job as failed. -->
    <stdio>
        <!-- negative and positive exit codes -->
        <exit_code range=":-1" />
        <exit_code range="1:" />
        <!-- textual markers; the optional space covers both "Exception:" and "Exception :" -->
        <regex match="Error:" />
        <regex match="Exception ?:" />
    </stdio>
    <!-- Reports the salmon version for the job record; long options take two dashes. -->
    <version_command>salmon --version</version_command>
    <!-- Command template (Cheetah).
         1. Creates the ./index and ./output working directories.
         2. When the reference comes from the history, builds an index in ./index with
            "salmon index"; otherwise takes the index path from the selected data table entry.
         3. Symlinks the read datasets (and the optional gene map) to fixed names whose
            extension is fasta or fastq, derived from the dataset extension.
         4. Runs "salmon quant", writing into ./output. Optional numeric parameters are only
            passed when they are not blank. Threads come from GALAXY_SLOTS (fallback 4).
         NOTE(review): the adv parameters minLen, sensitive, extraSensitive, coverage,
         splitWidth and splitSpanningSeeds are declared in the inputs but are not referenced
         here; confirm whether they should be passed when an fmd index is used. -->
    <command><![CDATA[
        mkdir ./index
        &&
        mkdir ./output
        &&
        #if $refTranscriptSource.TranscriptSource == "history":
            ## The separator that chains indexing to the next step lives inside this branch,
            ## so the built-in index branch does not emit an empty command between separators.
            salmon index
                --transcripts '$refTranscriptSource.ownFile'
                --kmerLen $refTranscriptSource.kmerLen
                --threads "\${GALAXY_SLOTS:-4}"
                --index './index'
                --type '$quasi_orphans.type'
                $perfectHash
                #if str($sasamp):
                    --sasamp $sasamp
                #end if
            &&
            #set $index_path = './index'
        #else:
            #set $index_path = $refTranscriptSource.index.fields.path
        #end if
        #if $single_or_paired.single_or_paired_opts == 'single':
            #if $single_or_paired.input_singles.ext == 'fasta':
                #set $ext = 'fasta'
            #else:
                #set $ext = 'fastq'
            #end if
            ln -s '$single_or_paired.input_singles' ./single.$ext &&
        #else:
            #if $single_or_paired.input_mate1.ext == 'fasta':
                #set $ext = 'fasta'
            #else:
                #set $ext = 'fastq'
            #end if
            ln -s '$single_or_paired.input_mate1' ./mate1.$ext &&
            ln -s '$single_or_paired.input_mate2' ./mate2.$ext &&
        #end if
        #if $geneMap:
            ln -s '$geneMap' ./geneMap.${geneMap.ext} &&
        #end if
        salmon quant
            --index '$index_path'
            #if $single_or_paired.single_or_paired_opts == 'single':
                --libType ${single_or_paired.strandedness}
                --unmatedReads ./single.$ext
            #else:
                --mates1 ./mate1.$ext
                --mates2 ./mate2.$ext
                --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
            #end if
            --output ./output
            #if str($quasi_orphans.type) == 'quasi':
                --allowOrphans
            #else:
                $quasi_orphans.allowOrphans
            #end if
            $seqBias
            $gcBias
            --threads "\${GALAXY_SLOTS:-4}"
            ## incompatPrior is optional, so only pass the flag when a value is present
            #if str($adv.incompatPrior):
                --incompatPrior $adv.incompatPrior
            #end if
            $adv.consistentHits
            $adv.dumpEq
            #if str($adv.gcSizeSamp):
                --gcSizeSamp $adv.gcSizeSamp
            #end if
            #if str($adv.biasSpeedSamp):
                --biasSpeedSamp $adv.biasSpeedSamp
            #end if
            $adv.strictIntersect
            #if str($adv.fldMax):
                --fldMax $adv.fldMax
            #end if
            #if str($adv.fldMean):
                --fldMean $adv.fldMean
            #end if
            #if str($adv.fldSD):
                --fldSD $adv.fldSD
            #end if
            #if str($adv.forgettingFactor):
                --forgettingFactor $adv.forgettingFactor
            #end if
            $adv.writeMappings
            #if str($adv.maxOcc):
                --maxOcc $adv.maxOcc
            #end if
            $adv.initUniform
            $adv.noFragLengthDist
            $adv.noBiasLengthThreshold
            #if str($adv.maxReadOcc):
                --maxReadOcc $adv.maxReadOcc
            #end if
            #if $geneMap:
                --geneMap ./geneMap.${geneMap.ext}
            #end if
            $adv.noEffectiveLengthCorrection
            $adv.useVBOpt
            #if str($adv.numBiasSamples):
                --numBiasSamples $adv.numBiasSamples
            #end if
            #if str($adv.numAuxModelSamples):
                --numAuxModelSamples $adv.numAuxModelSamples
            #end if
            #if str($adv.numPreAuxModelSamples):
                --numPreAuxModelSamples $adv.numPreAuxModelSamples
            #end if
            #if str($adv.numGibbsSamples):
                --numGibbsSamples $adv.numGibbsSamples
            #end if
            #if str($adv.numBootstraps):
                --numBootstraps $adv.numBootstraps
            #end if
            $adv.perTranscriptPrior
            #if str($adv.vbPrior):
                --vbPrior $adv.vbPrior
            #end if
            $adv.writeUnmappedNames
]]>
    </command>

    <inputs>
        <!-- Where the reference comes from: a pre-built index listed in the salmon_indexes
             data table, or a FASTA dataset from the history that the command template
             indexes on the fly with the k-mer length given here. -->
        <conditional name="refTranscriptSource">
            <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options">
                <option value="indexed">Use a built-in index</option>
                <option value="history" selected="True">Use one from the history</option>
            </param>
            <when value="indexed">
                <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin">
                    <options from_data_table="salmon_indexes">
                        <filter type="sort_by" column="2"/>
                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
                    </options>
                </param>
            </when>  <!-- built-in -->
            <when value="history">
                <param name="ownFile" type="data" format="fasta" label="Select the reference transcriptome" help="in FASTA format" />
                <!-- The parameter name derived from the argument is still kmerLen, as used by the command template. -->
                <param argument="--kmerLen" type="integer" value="31" label="k-mer length" help="The size of the k-mers used to build the index. The size should be an odd number."/>
            </when>  <!-- history -->
        </conditional>
        <!-- Read layout. Both branches accept FASTQ or FASTA datasets and expand the shared
             strandedness macro; the paired branch additionally asks for the relative
             orientation of the mates, which is prefixed to the strandedness value to form
             the library type. -->
        <conditional name="single_or_paired">
            <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
            </param>
            <when value="single">
                <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ or FASTA file." />
                <expand macro="strandedness" />
            </when>
            <when value="paired">
                <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ or FASTA file." />
                <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ or FASTA file." />
                <param name="orientation" type="select" label="Relative orientation of reads within a pair">
                    <option value="M">Mates are oriented in the same direction (M = matching)</option>
                    <option value="O">Mates are oriented away from each other (O = outward)</option>
                    <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option>
                </param>
                <expand macro="strandedness" />
            </when>
        </conditional>
        <!-- Index type selector. The selected value is passed to "salmon index" as the index
             type; during quantification the command template always allows orphans for a
             quasi index and uses the allowOrphans checkbox below for an fmd index. -->
        <conditional name="quasi_orphans">
            <param argument="--type" type="select" label="Type of index" help="When using quasi, orphaned reads will be considered when performing lightweight-alignment.">
                <option value="quasi" selected="True">quasi</option>
                <option value="fmd">fmd</option>
            </param>
            <when value="quasi">
            </when>  <!-- quasi: no extra options -->
            <when value="fmd">
                <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="True"
                    label="Consider orphaned reads as valid hits when performing lightweight-alignment"
                    help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>
            </when>  <!-- fmd -->
        </conditional>
        <!-- Index-building options (perfectHash, sasamp), bias-correction switches
             (seqBias, gcBias) and the optional transcript-to-gene map. -->
        <param argument="--perfectHash" type="boolean" truevalue="--perfectHash" falsevalue="" checked="False"
            label="Perfect Hash"
            help="Build the index using a perfect hash rather than a dense hash.  This will require  less memory (especially during quantification), but will take longer to construct "/>
        <param argument="--sasamp" type="integer" value="1" optional="True" label="Suffix Array"
            help="The interval at which the suffix array should be sampled. Smaller values are faster, but produce a larger index. The default should be OK, unless your transcriptome is huge. This value should be a power of 2."/>
        <param argument="--seqBias" type="boolean" truevalue="--seqBias" falsevalue="" checked="False"
            label="Perform sequence-specific bias correction"
            help=""/>
        <param argument="--gcBias" type="boolean" truevalue="--gcBias" falsevalue="" checked="False"
            label="Perform fragment GC bias correction"
            help=""/>
        <!-- Short label, with the long explanation moved into the help text. -->
        <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True"
            label="File containing a mapping of transcripts to genes"
            help="If this file is provided Salmon will output both quant.sf and quant.genes.sf files, where the latter contains aggregated gene-level abundance estimates. The transcript to gene mapping should be provided as either a GTF file, or in a simple tab-delimited format where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />
        <!-- Advanced quantification options. Boolean parameters expand to their flag when
             checked; optional numeric parameters are only passed by the command template
             when they are not blank.
             NOTE(review): minLen, sensitive, extraSensitive, coverage, splitWidth and
             splitSpanningSeeds are declared here but are not referenced by the command
             template in this file, so changing them currently has no effect. -->
        <section name="adv" title="Additional Options">
            <param argument="--writeMappings" type="boolean" truevalue="--writeMappings" falsevalue="" checked="False"
                label="Write Mappings"
                help="If this option is set, the quasi-mapping results will be written out in SAM-compatible format. By default, output will be directed to stdout, but an alternative file name can be provided instead." />
            <param argument="--incompatPrior" type="float" optional="True" value="1e-20"
                label="Incompatible Prior"
                help="This option sets the prior probability that an alignment that disagrees with the specified library type (--libType) results from the true fragment origin. Setting this to 0 specifies that alignments that disagree with the library type should be 'impossible', while setting it to 1 says that alignments that disagree with the library type are no less likely than those that do" />
            <param argument="--dumpEq" type="boolean" truevalue="--dumpEq" falsevalue="" checked="False"
                label="Dump the equivalence class counts that were computed during quasi-mapping." help=""/>
            <param argument="--gcSizeSamp" type="integer" value="1" optional="True"
                label="The value by which to down-sample transcripts when representing the GC content" help="Larger values will reduce memory usage, but may decrease the fidelity of bias modeling results."/>
            <param argument="--biasSpeedSamp" type="integer" value="1" optional="True"
                label="The value at which the fragment length PMF is down-sampled when evaluating GC fragment bias." help="Larger values speed up effective length correction, but may decrease the fidelity of bias modeling results."/>
            <param argument="--strictIntersect" type="boolean" truevalue="--strictIntersect" falsevalue="" checked="False"
                label="Modifies how orphans are assigned." help="When this flag is set, if the intersection of the quasi-mappings for the left and right is empty, then all mappings for the left and all mappings for the right read are reported as orphaned quasi-mappings."/>
            <param argument="--minLen" type="integer" value="19" optional="True"
                label="(S)MEMs smaller than this size won't be considered." help="" />
            <param argument="--sensitive" type="boolean" truevalue="--sensitive" falsevalue="" checked="False"
                label="Perform sensitive quantification"
                help="Setting this option enables the splitting of SMEMs that are larger than 1.5 times the minimum seed length (minLen/k above).  This may reveal high scoring chains of MEMs that are masked by long SMEMs.  However, this option makes lightweight-alignment a bit slower and is usually not necessary if the reference is of reasonable quality." />
            <param argument="--consistentHits" type="boolean" truevalue="--consistentHits" falsevalue="" checked="False"
                label="Force hits gathered during quasi-mapping to be consistent"
                help="" />
            <param argument="--extraSensitive" type="boolean" truevalue="--extraSensitive" falsevalue="" checked="False"
                label="Perform extra sensitive quantification"
                help="Setting this option enables an extra pass of 'seed' search. Enabling this option may improve sensitivity (the number of reads having sufficient coverage), but will typically slow down quantification by ~40%.  Consider enabling this option if you find the mapping rate to be significantly lower than expected."/>
            <param argument="--coverage" type="float" value="0.7" optional="True"
                label="Required coverage of read by union of SMEMs to consider it a hit"
                help="" />
            <param argument="--fldMax" type="integer" value="1000" optional="True"
                label="The maximum fragment length to consider when building the empirical distribution."
                help=""/>
            <param argument="--fldMean" type="integer" value="200" optional="True"
                label="The mean used in the fragment length distribution prior"
                help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>
            <param argument="--fldSD" type="integer" value="80" optional="True"
                label="Standard deviation"
                help="The standard deviation used in the fragment length distribution prior."/>
            <param argument="--forgettingFactor" type="float" value="0.65" optional="True"
                label="The forgetting factor used in the online learning schedule."
                help="A smaller value results in quicker learning, but higher variance and may be unstable. A larger value results in slower learning but may be more stable.  Value should be in the interval (0.5, 1.0]." />
            <param argument="--maxOcc" type="integer" value="200" optional="True"
                label="(S)MEMs occurring more than this many times won't be considered"
                help=""/>
            <param argument="--initUniform" type="boolean" truevalue="--initUniform" falsevalue="" checked="False"
                label="Initialization with uniform parameters"
                help="Initialize the offline inference with uniform parameters, rather than seeding with online parameters." />
            <param argument="--maxReadOcc" type="integer" value="100" optional="True"
                label="Maximal read mapping occurrence"
                help="Reads mapping to more than this many places won't be considered."/>
            <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False"
                label="Disable effective length correction"
                help="Disables effective length correction when computing the probability that a fragment was generated from a transcript. If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>
            <param argument="--noFragLengthDist" type="boolean" truevalue="--noFragLengthDist" falsevalue="" checked="False"
                label="Ignore fragment length distribution"
                help="[experimental] : Don't consider concordance with the learned fragment length distribution when trying to determine the probability that a fragment has originated from a specified location.  Normally, Fragments with unlikely lengths will be assigned a smaller relative probability than those with more likely lengths. When this flag is passed in, the observed fragment length has no effect on that fragment's a priori probability." />
            <param argument="--noBiasLengthThreshold" type="boolean" truevalue="--noBiasLengthThreshold" falsevalue="" checked="False"
                label="[experimental] : If this option is enabled, then no (lower) threshold will be set on how short bias correction can make effective lengths."
                help="This can increase the precision of bias correction, but harm robustness. The default correction applies a threshold." />
            <param argument="--numBiasSamples" type="integer" value="2000000" optional="True"
                label="Number of fragment mappings to use when learning the sequence-specific bias model."
                help="" />
            <param argument="--numAuxModelSamples" type="integer" value="5000000" optional="True"
                label="The first numAuxModelSamples are used to train the auxiliary model parameters."
                help="(e.g. fragment length distribution, bias, etc.). After the first numAuxModelSamples observations the auxiliary model parameters will be assumed to have converged and will be fixed." />
            <param argument="--numPreAuxModelSamples" type="integer" value="1000000" optional="True"
                label="The first numPreAuxModelSamples will have their assignment likelihoods and contributions to the transcript abundances computed without applying any auxiliary models."
                help="The purpose of ignoring the auxiliary models for the first numPreAuxModelSamples observations is to avoid applying these models before their parameters have been learned sufficiently well." />
            <param argument="--splitWidth" type="integer" value="0" optional="True"
                label="If (S)MEM occurs fewer than this many times, search for smaller, contained  MEMs"
                help="The default value will not split (S)MEMs, a higher value will result in more MEMs being explored and, thus, will result in increased running time." />
            <param argument="--splitSpanningSeeds" type="boolean" truevalue="--splitSpanningSeeds" falsevalue="" checked="False"
                label="Attempt to split seeds that happen to fall on the boundary between two transcripts."
                help="This can improve the fragment hit-rate, but is usually not necessary."/>
            <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False"
                label="Use the Variational Bayesian EM rather than the traditional EM algorithm for optimization in the batch passes."
                help=""/>
            <param argument="--numGibbsSamples" type="integer" value="0" optional="True"
                label="Number of Gibbs sampling rounds to perform."
                help="" />
            <param argument="--numBootstraps" type="integer" value="0" optional="True"
                label="Number of bootstrap samples to generate. Note: This is mutually exclusive with Gibbs sampling."
                help="" />
            <param argument="--perTranscriptPrior" type="boolean" truevalue="--perTranscriptPrior" falsevalue="" checked="False"
                label="The prior will be interpreted as a transcript-level prior."
                help="either the default or the argument provided via --vbPrior" />
            <param argument="--vbPrior" type="float" value="0.001" optional="True"
                label="The prior that will be used in the VBEM algorithm."
                help="This is interpreted as a per-nucleotide prior, unless the --perTranscriptPrior flag is also given, in which case this is used as a transcript-level prior." />
            <param argument="--writeUnmappedNames" type="boolean" truevalue="--writeUnmappedNames" falsevalue="" checked="False"
                label="Write the names of un-mapped reads to the file unmapped_names.txt."
                help=""/>
        </section>
    </inputs>

    <outputs>
        <!-- Transcript-level estimates, collected from the quant output directory. -->
        <data name="output_quant"
              format="tabular"
              from_work_dir="output/quant.sf"
              label="${tool.name} on ${on_string} (Quantification)" />
        <!-- Gene-level estimates; only created when a geneMap dataset was supplied. -->
        <data name="output_gene_quant"
              format="tabular"
              from_work_dir="output/quant.genes.sf"
              label="${tool.name} on ${on_string} (Gene Quantification)">
            <filter>geneMap</filter>
        </data>
    </outputs>

    <tests>
        <!-- Paired-end reads, index built from a history FASTA, no gene map. -->
        <test>
            <param name="single_or_paired_opts" value="paired" />
            <param name="input_mate1" value="reads_1.fastq" />
            <param name="input_mate2" value="reads_2.fastq" />
            <!-- This tool has no biasCorrect parameter; sequence bias correction is
                 controlled by seqBias, so that is what is switched off here. -->
            <param name="seqBias" value="False" />
            <param name="TranscriptSource" value="history" />
            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
            <output name="output_quant">
                <assert_contents>
                    <has_text text="EffectiveLength" />
                    <has_text text="TPM" />
                    <has_text text="NM_001168316" />
                    <has_text text="NM_174914" />
                    <has_text text="NM_018953" />
                    <has_text text="NR_003084" />
                    <has_text text="NM_017410" />
                    <has_text text="NM_153693" />
                    <has_text text="NR_031764" />
                    <has_n_columns n="5" />
                </assert_contents>
            </output>
        </test>
        <!-- Same inputs plus a transcript-to-gene map, which also produces the gene-level output. -->
        <test>
            <param name="single_or_paired_opts" value="paired" />
            <param name="input_mate1" value="reads_1.fastq" />
            <param name="input_mate2" value="reads_2.fastq" />
            <param name="TranscriptSource" value="history" />
            <param name="ownFile" value="transcripts.fasta" ftype="fasta" />
            <param name="geneMap" value="gene_map.tab" ftype="tabular" />
            <output name="output_quant">
                <assert_contents>
                    <has_text text="EffectiveLength" />
                    <has_text text="TPM" />
                    <has_text text="NM_001168316" />
                    <has_text text="NM_174914" />
                    <has_text text="NM_018953" />
                    <has_text text="NR_003084" />
                    <has_text text="NM_017410" />
                    <has_text text="NM_153693" />
                    <has_text text="NR_031764" />
                    <has_n_columns n="5" />
                </assert_contents>
            </output>
            <output name="output_gene_quant">
                <assert_contents>
                    <has_text text="EffectiveLength" />
                    <has_text text="TPM" />
                    <has_text text="baz" />
                    <has_text text="bar" />
                    <has_text text="2283" />
                    <has_text text="1640" />
                    <has_n_columns n="5" />
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
**What it does**
salmon is a tool for transcript quantification from RNA-seq data.  It
requires a set of target transcripts (either from a reference or de-novo
assembly) to quantify.  All you need to run Salmon is a fasta file containing
your reference transcripts and a (set of) fasta/fastq file(s) containing your
reads.  Salmon runs in two phases; indexing and quantification.  The indexing
step is independent of the reads, and only needs to be run once for a particular
set of reference transcripts and choice of k (the k-mer size). The
quantification step, obviously, is specific to the set of RNA-seq reads and is
thus run more frequently.
The quantification output contains a number of columns:
(1) Transcript ID (Name),
(2) Transcript Length (Length),
(3) Effective transcript length (EffectiveLength),
(4) Transcripts per Million (TPM) and
(5) Estimated number of reads (NumReads; an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length).
The first two columns are self-explanatory, the effective length is the length used when computing abundances, TPM is a measure of transcript abundance and the final column is a commonly used input for differential expression tools.
The Transcripts per Million quantification number is computed as described in [1], and is meant as an estimate of the number of transcripts, per million observed transcripts,
originating from each isoform. Its benefit over the F/RPKM measure is that it is independent of the mean expressed transcript length
(i.e. if the mean expressed transcript length varies between samples, this alone can affect differential analysis based on the F/RPKM).


Fragment Library Types
======================

There are numerous library preparation protocols for RNA-seq that result in
sequencing reads with different characteristics.  For example, reads can be
single end (only one side of a fragment is recorded as a read) or paired-end
(reads are generated from both ends of a fragment).  Further, the sequencing
reads themselves may be unstranded or strand-specific.  Finally, paired-end
protocols will have a specified relative orientation.  To characterize the
various different types of sequencing libraries, we've created a miniature
"language" that allows for the succinct description of the many different types
of possible fragment libraries.  For paired-end reads, the possible
orientations, along with a graphical description of what they mean, are
illustrated below:

.. image:: ReadLibraryIllustration.png

The library type string consists of three parts: the relative orientation of
the reads, the strandedness of the library, and the directionality of the
reads.
The first part of the library string (relative orientation) is only provided if
the library is paired-end. The possible options are:
::

    I = inward
    O = outward
    M = matching

The second part of the read library string specifies whether the protocol is
stranded or unstranded; the options are:
::

    S = stranded
    U = unstranded

If the protocol is unstranded, then we're done.  The final part of the library
string specifies the strand from which the read originates in a strand-specific
protocol — it is only provided if the library is stranded (i.e. if the
library format string is of the form S).  The possible values are:
::

    F = read 1 (or single-end read) comes from the forward strand
    R = read 1 (or single-end read) comes from the reverse strand

So, for example, if you wanted to specify a fragment library of strand-specific
paired-end reads, oriented toward each other, where read 1 comes from the
forward strand and read 2 comes from the reverse strand, you would specify ``-l
ISF`` on the command line.  This designates that the library being processed has
the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded
(the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
strand).
The single-end library strings are a bit simpler than their paired-end
counterparts, since there is no relative orientation of which to speak.  Thus, the
only possible library format types for single-end reads are ``U`` (for
unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
and ``SR`` (for strand-specific reads coming from the reverse strand).
A few more examples of some library format strings and their interpretations are:
::

    IU (an unstranded paired-end library where the reads face each other)

::

    SF (a stranded single-end protocol where the reads come from the forward strand)

::

    OSR (a stranded paired-end protocol where the reads face away from each other,
         read1 comes from reverse strand and read2 comes from the forward strand)

.. note:: Correspondence to TopHat library types

   The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq
   read aligner has a different convention for specifying the format of the library.
   Below is a table that provides the corresponding Salmon/Sailfish library format
   string for each of the potential TopHat library types:

   +---------------------+-------------------------+
   | TopHat              | Salmon (and Sailfish)   |
   +=====================+============+============+
   |                     | Paired-end | Single-end |
   +---------------------+------------+------------+
   |``-fr-unstranded``   |``-l IU``   |``-l U``    |
   +---------------------+------------+------------+
   |``-fr-firststrand``  |``-l ISR``  |``-l SR``   |
   +---------------------+------------+------------+
   |``-fr-secondstrand`` |``-l ISF``  |``-l SF``   |
   +---------------------+------------+------------+

   The remaining salmon library format strings are not directly expressible in terms
   of the TopHat library types, and so there is no direct mapping for them.
]]> </help>
    <!-- Publication to cite for this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1101/021592</citation>
    </citations>
</tool>