comparison sailfish.xml @ 0:3b4ed0e473dc draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sailfish commit bd2dd2419ea52f30cd7de2f7109a12b49b5d0dba-dirty
author bgruening
date Fri, 16 Oct 2015 15:09:03 -0400
parents
children 06646e81c543
comparison
equal deleted inserted replaced
-1:000000000000 0:3b4ed0e473dc
1 <tool id="sailfish" name="Sailfish" version="0.7.6.0">
2 <description>transcript quantification from RNA-seq data</description>
3 <requirements> <!-- packages this wrapper depends on -->
4 <requirement type="package" version="0.7.6">sailfish</requirement> <!-- the sailfish package, pinned to version 0.7.6 -->
5 </requirements>
6 <macros> <!-- shared fragment, inserted wherever an expand element names the "strandedness" macro -->
7 <xml name="strandedness">
8 <param label="Specify the strandedness of the reads" type="select" name="strandedness">
9 <option selected="True" value="U">Not stranded</option>
10 <option value="SF">read 1 (or single-end read) comes from the forward strand</option>
11 <option value="SR">read 1 (or single-end read) comes from the reverse strand</option>
12 </param>
13 </xml>
14 </macros>
15 <stdio> <!-- a non-zero exit code, or "Error:" / "Exception:" in either output stream, fails the job; level and source are spelled out with their default values -->
16 <exit_code range="1:" level="fatal" />
17 <exit_code range=":-1" level="fatal" />
18 <regex match="Error:" source="both" level="fatal" />
19 <regex match="Exception:" source="both" level="fatal" />
20 </stdio>
21 <version_command>sailfish -version</version_command> <!-- command whose output reports the sailfish version. NOTE(review): single-dash "-version" looks unusual; confirm sailfish accepts it -->
22 <command>
23 <![CDATA[
24 ## Step 1: index a FASTA from the history if requested. The trailing "&&" lives inside this branch so that choosing a built-in index does not leave a dangling "&&" at the start of the command line.
25 #if $refTranscriptSource.TranscriptSource == "history":
26 sailfish index
27 --transcripts '$refTranscriptSource.ownFile'
28 --kmerSize $refTranscriptSource.kmerSize
29 --out ./index_dir
30 --threads "\${GALAXY_SLOTS:-4}" &&
31 #set $index_path = './index_dir'
32 #else:
33 #set $index_path = $refTranscriptSource.index.fields.path
34 #end if
35
36 ## Step 2: symlink the inputs to file names that carry their datatype extension. Dataset paths are quoted for the shell.
37
38 #if $single_or_paired.single_or_paired_opts == 'single':
39 ln -s '$single_or_paired.input_singles' ./single.$single_or_paired.input_singles.ext &&
40 #else:
41 ln -s '$single_or_paired.input_mate1' ./mate1.$single_or_paired.input_mate1.ext &&
42 ln -s '$single_or_paired.input_mate2' ./mate2.$single_or_paired.input_mate2.ext &&
43 #end if
44
45
46 #if $geneMap:
47 ln -s "$geneMap" ./geneMap.$geneMap.ext &&
48 #end if
49 ## Step 3: quantify. Single-end libraries have no orientation parameter, so their library type is the strandedness value alone; paired-end libraries prepend the orientation letter.
50 sailfish quant
51 --index '$index_path'
52 #if $single_or_paired.single_or_paired_opts == 'single':
53 --libType "${single_or_paired.strandedness}"
54 --unmated_reads ./single.$single_or_paired.input_singles.ext
55 #else:
56 --mates1 ./mate1.$single_or_paired.input_mate1.ext
57 --mates2 ./mate2.$single_or_paired.input_mate2.ext
58 --libType "${single_or_paired.orientation}${single_or_paired.strandedness}"
59 #end if
60 --output ./
61 $biasCorrect
62 --threads "\${GALAXY_SLOTS:-4}"
63 ## Optional numeric settings are only passed when a value was supplied, so an empty field never emits a switch without its argument.
64 #if $fldMean:
65 --fldMean $fldMean
66 #end if
67
68 #if $fldSD:
69 --fldSD $fldSD
70 #end if
71
72 #if $maxReadOcc:
73 --maxReadOcc $maxReadOcc
74 #end if
75
76 #if $geneMap:
77 --geneMap ./geneMap.${geneMap.ext}
78 #end if
79
80 $noEffectiveLengthCorrection
81 $useVBOpt
82 $allowOrphans
83 $unsmoothedFLD
84 #if $maxFragLen:
85 --maxFragLen ${maxFragLen}
86 #end if
87 --txpAggregationKey "${txpAggregationKey}"
88 ]]>
89 </command>
90 <inputs>
91 <conditional name="refTranscriptSource"> <!-- choose between a pre-built index and indexing a FASTA file taken from the history -->
92 <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options">
93 <option value="indexed">Use a built-in index</option>
94 <option value="history" selected="True">Use one from the history</option>
95 </param>
96 <when value="indexed">
97 <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin">
98 <options from_data_table="sailfish_indexes"> <!-- choices are read from the sailfish_indexes data table -->
99 <filter type="sort_by" column="2"/>
100 <validator type="no_options" message="No indexes are available for the selected input dataset"/>
101 </options>
102 </param>
103 </when> <!-- built-in -->
104 <when value="history">
105 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" />
106 <param argument="--kmerSize" type="integer" value="21" max="32" label="The size of the k-mer on which the index is built"
107 help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors.
108 The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers,
109 the more distinct they will be. We generally recommend using a k-mer size of at least 20."/> <!-- argument carries the leading dashes like every other argument in this tool; the derived parameter name is still kmerSize -->
110 </when> <!-- history -->
111 </conditional> <!-- refTranscriptSource -->
112
113 <conditional name="single_or_paired"> <!-- library layout: decides which read inputs and library-type parameters are shown -->
114 <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?">
115 <option value="single">Single-end</option>
116 <option value="paired">Paired-end</option>
117 </param>
118 <when value="single"> <!-- single-end: one read file plus strandedness, no orientation parameter -->
119 <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ or FASTA file." />
120 <expand macro="strandedness" />
121 </when>
122 <when value="paired"> <!-- paired-end: two mate files, their relative orientation, and strandedness -->
123 <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ or FASTA file." />
124 <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ or FASTA file." />
125 <param name="orientation" type="select" label="Relative orientation of reads within a pair">
126 <option value="M">Mates are oriented in the same direction (M = matching)</option>
127 <option value="O">Mates are oriented away from each other (O = outward)</option>
128 <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option>
129 </param>
130 <expand macro="strandedness" />
131 </when>
132 </conditional>
133
134 <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True" label="File containing a mapping of transcripts to genes"
135 help="Calculates the aggregated gene-level abundance estimations. This file should be either a GTF file or tab-delimited format
136 where each line contains the name of a transcript and the gene to which it belongs separated by a tab." />
137 <!-- Boolean flags in this section expand to their command-line switch when checked and to an empty string otherwise -->
138 <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False"
139 label="Perform bias correction" help=""/>
140 <!-- Optional numeric settings; each may be left empty -->
141 <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths"
142 help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification
143 to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/>
144
145 <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation"
146 help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/>
147
148 <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurrence"
149 help="Reads mapping to more than this many places won't be considered."/>
150
151 <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False"
152 label="Disable effective length correction" help="Disables effective length correction when computing the probability that a fragment was generated from a transcript.
153 If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/>
154
155 <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False"
156 label="Use Variational Bayesian EM algorithm for optimization" help=""/>
157
158 <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False"
159 label="Consider orphaned reads as valid hits when performing lightweight-alignment"
160 help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/>
161
162 <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False"
163 label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/>
164
165 <param argument="--maxFragLen" type="integer" value="1000" optional="True"
166 label="The maximum length of a fragment to consider when building the empirical fragment length distribution"
167 help=""/>
168
169 <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates"
170 help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." />
171
172 </inputs>
173 <outputs> <!-- each dataset is collected from the file sailfish writes into the working directory -->
174 <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" />
175 <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)">
176 <filter>biasCorrect is True</filter> <!-- the parameter is named biasCorrect; only created when bias correction is switched on -->
177 </data>
178 <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)">
179 <filter>geneMap</filter> <!-- geneMap is an optional dataset, never the literal True; test for its presence instead -->
180 </data>
181 </outputs>
182 <tests>
183 <test> <!-- paired-end reads, bias correction on, transcriptome indexed from a history FASTA -->
184 <param name="TranscriptSource" value="history" />
185 <param name="ownFile" ftype="fasta" value="transcripts.fasta" />
186 <param name="single_or_paired_opts" value="paired" />
187 <param name="input_mate1" value="reads_1.fastq" />
188 <param name="input_mate2" value="reads_2.fastq" />
189 <param name="biasCorrect" value="True" />
190 <output name="output_quant" ftype="tabular" file="sailfish_quant_result1.tab" />
191 <output name="output_bias_corrected_quant" ftype="tabular" file="sailfish_bias_result1.tab" />
192 </test>
193 </tests>
194 <help>
195 <![CDATA[
196 **What it does**
197
198 Sailfish is a tool for transcript quantification from RNA-seq data. It
199 requires a set of target transcripts (either from a reference or *de novo*
200 assembly) to quantify. All you need to run Sailfish is a fasta file containing
201 your reference transcripts and a (set of) fasta/fastq file(s) containing your
202 reads. Sailfish runs in two phases; indexing and quantification. The indexing
203 step is independent of the reads, and only needs to be run once for a particular
204 set of reference transcripts and choice of k (the k-mer size). The
205 quantification step, obviously, is specific to the set of RNA-seq reads and is
206 thus run more frequently.
207
208 The quantification output contains a number of columns:
209 (1) Transcript ID,
210 (2) Transcript Length,
211 (3) Transcripts per Million (TPM) and
212 (4) Estimated number of reads (an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length).
213
214 The first two columns are self-explanatory, the third is a measure of transcript abundance and the last is a commonly used input for differential expression tools.
215 The Transcripts per Million quantification number is computed as described in [1], and is meant as an estimate of the number of transcripts, per million observed transcripts,
216 originating from each isoform. Its benefit over the F/RPKM measure is that it is independent of the mean expressed transcript length
217 (i.e. if the mean expressed transcript length varies between samples, for example, this alone can affect differential analysis based on the F/RPKM).
218
219
220
221 Fragment Library Types
222 ======================
223
224 There are numerous library preparation protocols for RNA-seq that result in
225 sequencing reads with different characteristics. For example, reads can be
226 single end (only one side of a fragment is recorded as a read) or paired-end
227 (reads are generated from both ends of a fragment). Further, the sequencing
228 reads themselves may be unstranded or strand-specific. Finally, paired-end
229 protocols will have a specified relative orientation. To characterize the
230 various different types of sequencing libraries, we've created a miniature
231 "language" that allows for the succinct description of the many different types
232 of possible fragment libraries. For paired-end reads, the possible
233 orientations, along with a graphical description of what they mean, are
234 illustrated below:
235
236 .. image:: ReadLibraryIllustration.png
237
238 The library type string consists of three parts: the relative orientation of
239 the reads, the strandedness of the library, and the directionality of the
240 reads.
241
242 The first part of the library string (relative orientation) is only provided if
243 the library is paired-end. The possible options are:
244
245 ::
246
247 I = inward
248 O = outward
249 M = matching
250
251 The second part of the read library string specifies whether the protocol is
252 stranded or unstranded; the options are:
253
254 ::
255
256 S = stranded
257 U = unstranded
258
259 If the protocol is unstranded, then we're done. The final part of the library
260 string specifies the strand from which the read originates in a strand-specific
261 protocol — it is only provided if the library is stranded (i.e. if the
262 library format string is of the form S). The possible values are:
263
264 ::
265
266 F = read 1 (or single-end read) comes from the forward strand
267 R = read 1 (or single-end read) comes from the reverse strand
268
269 So, for example, if you wanted to specify a fragment library of strand-specific
270 paired-end reads, oriented toward each other, where read 1 comes from the
271 forward strand and read 2 comes from the reverse strand, you would specify ``-l
272 ISF`` on the command line. This designates that the library being processed has
273 the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded
274 (the protocol is strand-specific), **F**\ orward (read 1 comes from the forward
275 strand).
276
277 The single-end library strings are a bit simpler than their paired-end
278 counterparts, since there is no relative orientation of which to speak. Thus, the
279 only possible library format types for single-end reads are ``U`` (for
280 unstranded), ``SF`` (for strand-specific reads coming from the forward strand)
281 and ``SR`` (for strand-specific reads coming from the reverse strand).
282
283 A few more examples of some library format strings and their interpretations are:
284
285 ::
286
287 IU (an unstranded paired-end library where the reads face each other)
288
289 ::
290
291 SF (a stranded single-end protocol where the reads come from the forward strand)
292
293 ::
294
295 OSR (a stranded paired-end protocol where the reads face away from each other,
296 read1 comes from reverse strand and read2 comes from the forward strand)
297
298 .. note:: Correspondence to TopHat library types
299
300 The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq
301 read aligner has a different convention for specifying the format of the library.
302 Below is a table that provides the corresponding sailfish/salmon library format
303 string for each of the potential TopHat library types:
304
305
306 +---------------------+-------------------------+
307 | TopHat | Salmon (and Sailfish) |
308 +=====================+============+============+
309 | | Paired-end | Single-end |
310 +---------------------+------------+------------+
311 |``-fr-unstranded`` |``-l IU`` |``-l U`` |
312 +---------------------+------------+------------+
313 |``-fr-firststrand`` |``-l ISR`` |``-l SR`` |
314 +---------------------+------------+------------+
315 |``-fr-secondstrand`` |``-l ISF`` |``-l SF`` |
316 +---------------------+------------+------------+
317
318 The remaining salmon library format strings are not directly expressible in terms
319 of the TopHat library types, and so there is no direct mapping for them.
320
321
322 ]]>
323 </help>
324 </tool>