Mercurial > repos > bgruening > sailfish
comparison sailfish.xml @ 0:3b4ed0e473dc draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sailfish commit bd2dd2419ea52f30cd7de2f7109a12b49b5d0dba-dirty
author | bgruening |
---|---|
date | Fri, 16 Oct 2015 15:09:03 -0400 |
parents | |
children | 06646e81c543 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b4ed0e473dc |
---|---|
1 <tool id="sailfish" name="Sailfish" version="0.7.6.0"> | |
2 <description>transcript quantification from RNA-seq data</description> | |
3 <requirements> | |
4 <requirement type="package" version="0.7.6">sailfish</requirement> | |
5 </requirements> | |
6 <macros> | |
7 <xml name="strandedness"> | |
8 <param name="strandedness" type="select" label="Specify the strandedness of the reads"> | |
9 <option value="U" selected="True">Not stranded</option> | |
10 <option value="SF">read 1 (or single-end read) comes from the forward strand</option> | |
11 <option value="SR">read 1 (or single-end read) comes from the reverse strand</option> | |
12 </param> | |
13 </xml> | |
14 </macros> | |
15 <stdio> | |
16 <exit_code range="1:" /> | |
17 <exit_code range=":-1" /> | |
18 <regex match="Error:" /> | |
19 <regex match="Exception:" /> | |
20 </stdio> | |
21 <version_command>sailfish -version</version_command> | |
22 <command> | |
23 <![CDATA[ | |
24 ## Build an index only for a history transcriptome; the && separator stays inside that branch so a built-in index does not produce a command line starting with && | |
25 #if $refTranscriptSource.TranscriptSource == "history": | |
26 sailfish index | |
27 --transcripts $refTranscriptSource.ownFile | |
28 --kmerSize $refTranscriptSource.kmerSize | |
29 --out ./index_dir | |
30 --threads "\${GALAXY_SLOTS:-4}" && | |
31 #set $index_path = './index_dir' | |
32 #else: | |
33 #set $index_path = $refTranscriptSource.index.fields.path | |
34 #end if | |
35 | |
36 | |
37 ## Symlink the read files into the working directory under names that end in the datatype extension | |
38 #if $single_or_paired.single_or_paired_opts == 'single': | |
39 ln -s $single_or_paired.input_singles ./single.$single_or_paired.input_singles.ext && | |
40 #else: | |
41 ln -s $single_or_paired.input_mate1 ./mate1.$single_or_paired.input_mate1.ext && | |
42 ln -s $single_or_paired.input_mate2 ./mate2.$single_or_paired.input_mate2.ext && | |
43 #end if | |
44 | |
45 ## The optional gene map is linked the same way | |
46 #if $geneMap: | |
47 ln -s "$geneMap" ./geneMap.$geneMap.ext && | |
48 #end if | |
49 ## Quantification: single-end libraries have no orientation parameter, so their libType is the strandedness alone; paired-end prepends the orientation | |
50 sailfish quant | |
51 --index $index_path | |
52 #if $single_or_paired.single_or_paired_opts == 'single': | |
53 --libType "${single_or_paired.strandedness}" | |
54 --unmated_reads ./single.$single_or_paired.input_singles.ext | |
55 #else: | |
56 --mates1 ./mate1.$single_or_paired.input_mate1.ext | |
57 --mates2 ./mate2.$single_or_paired.input_mate2.ext | |
58 --libType "${single_or_paired.orientation}${single_or_paired.strandedness}" | |
59 #end if | |
60 --output ./ | |
61 $biasCorrect | |
62 --threads "\${GALAXY_SLOTS:-4}" | |
63 ## Optional numeric settings are passed only when a value was entered | |
64 #if $fldMean: | |
65 --fldMean $fldMean | |
66 #end if | |
67 | |
68 #if $fldSD: | |
69 --fldSD $fldSD | |
70 #end if | |
71 | |
72 #if $maxReadOcc: | |
73 --maxReadOcc $maxReadOcc | |
74 #end if | |
75 | |
76 #if $geneMap: | |
77 --geneMap ./geneMap.${geneMap.ext} | |
78 #end if | |
79 ## Boolean parameters expand to their flag or to an empty string | |
80 $noEffectiveLengthCorrection | |
81 $useVBOpt | |
82 $allowOrphans | |
83 | |
84 $unsmoothedFLD | |
85 --maxFragLen ${maxFragLen} | |
86 --txpAggregationKey "${txpAggregationKey}" | |
87 | |
88 ]]> | |
89 </command> | |
90 <inputs> | |
91 <conditional name="refTranscriptSource"> | |
92 <param name="TranscriptSource" type="select" label="Select a reference transcriptome from your history or use a built-in index?" help="Built-ins were indexed using default options"> | |
93 <option value="indexed">Use a built-in index</option> | |
94 <option value="history" selected="True">Use one from the history</option> | |
95 </param> | |
96 <when value="indexed"> | |
97 <param name="index" type="select" label="Select a reference transcriptome" help="If your transcriptome of interest is not listed, contact your Galaxy admin"> | |
98 <options from_data_table="sailfish_indexes"> | |
99 <filter type="sort_by" column="2"/> | |
100 <validator type="no_options" message="No indexes are available for the selected input dataset"/> | |
101 </options> | |
102 </param> | |
103 </when> <!-- built-in --> | |
104 <when value="history"> | |
105 <param name="ownFile" type="data" format="fasta" metadata_name="dbkey" label="Select the reference transcriptome" /> | |
106 <param argument="kmerSize" type="integer" value="21" max="32" label="The size of the k-mer on which the index is built" | |
107 help="There is a tradeoff here between the distinctiveness of the k-mers and their robustness to errors. | |
108 The shorter the k-mers, the more robust they will be to errors in the reads, but the longer the k-mers, | |
109 the more distinct they will be. We generally recommend using a k-mer size of at least 20."/> | |
110 </when> <!-- history --> | |
111 </conditional> <!-- refTranscriptSource --> | |
112 | |
113 <conditional name="single_or_paired"> | |
114 <param name="single_or_paired_opts" type="select" label="Is this library mate-paired?"> | |
115 <option value="single">Single-end</option> | |
116 <option value="paired">Paired-end</option> | |
117 </param> | |
118 <when value="single"> | |
119 <param name="input_singles" type="data" format="fastq,fasta" label="FASTQ/FASTA file" help="FASTQ file." /> | |
120 <expand macro="strandedness" /> | |
121 </when> | |
122 <when value="paired"> | |
123 <param name="input_mate1" type="data" format="fastq,fasta" label="Mate pair 1" help="FASTQ file." /> | |
124 <param name="input_mate2" type="data" format="fastq,fasta" label="Mate pair 2" help="FASTQ file." /> | |
125 <param name="orientation" type="select" label="Relative orientation of reads within a pair"> | |
126 <option value="M">Mates are oriented in the same direction (M = matching)</option> | |
127 <option value="O">Mates are oriented away from each other (O = outward)</option> | |
128 <option value="I" selected="True">Mates are oriented toward each other (I = inward)</option> | |
129 </param> | |
130 <expand macro="strandedness" /> | |
131 </when> | |
132 </conditional> | |
133 | |
134 <param argument="--geneMap" type="data" format="tabular,gff,gtf" optional="True" label="File containing a mapping of transcripts to genes" | |
135 help="Calculates the aggregated gene-level abundance estimations. This file should be eiher a GTF file or tab-delimited format | |
136 where each line contains the name of a transcript and the gene to which it belongs separated by a tab." /> | |
137 | |
138 <param argument="--biasCorrect" type="boolean" truevalue="--biasCorrect" falsevalue="" checked="False" | |
139 label="Perform bias correction" help=""/> | |
140 | |
141 <param argument="--fldMean" type="integer" value="200" optional="True" label="Calculate effective lengths" | |
142 help="If single end reads are being used for quantification, or there are an insufficient number of uniquely mapping reads when performing paired-end quantification | |
143 to estimate the empirical fragment length distribution, then use this value to calculate effective lengths."/> | |
144 | |
145 <param argument="--fldSD" type="integer" value="80" optional="True" label="Standard deviation" | |
146 help="The standard deviation used in the fragment length distribution for single-end quantification or when an empirical distribution cannot be learned."/> | |
147 | |
148 <param argument="--maxReadOcc" type="integer" value="200" optional="True" label="Maximal read mapping occurence" | |
149 help="Reads mapping to more than this many places won't be considered."/> | |
150 | |
151 <param argument="--noEffectiveLengthCorrection" type="boolean" truevalue="--noEffectiveLengthCorrection" falsevalue="" checked="False" | |
152 label="Disable effective length correction" help="Disables effective length correction when computing the probability that a fragment was generated from a transcript. | |
153 If this flag is passed in, the fragment length distribution is not taken into account when computing this probability."/> | |
154 | |
155 <param argument="--useVBOpt" type="boolean" truevalue="--useVBOpt" falsevalue="" checked="False" | |
156 label="Use Variational Bayesian EM algorithm for optimization" help=""/> | |
157 | |
158 <param argument="--allowOrphans" type="boolean" truevalue="--allowOrphans" falsevalue="" checked="False" | |
159 label="Consider orphaned reads as valid hits when performing lightweight-alignment" | |
160 help="This option will increase sensitivity (allow more reads to map and more transcripts to be detected), but may decrease specificity as orphaned alignments are more likely to be spurious."/> | |
161 | |
162 <param argument="--unsmoothedFLD" type="boolean" truevalue="--unsmoothedFLD" falsevalue="" checked="False" | |
163 label="Use the un-smoothed approach to effective length correction" help="This traditional approach works by convolving the FLD with the characteristic function over each transcript."/> | |
164 | |
165 <param argument="--maxFragLen" type="integer" value="1000" optional="True" | |
166 label="The maximum length of a fragment to consider when building the empirical fragment length distribution" | |
167 help=""/> | |
168 | |
169 <param argument="--txpAggregationKey" value="gene_id" type="text" label="The key for aggregating transcripts during gene-level estimates" | |
170 help="The default is the gene_id field, but other fields (e.g. gene_name) might be useful depending on the specifics of the annotation being used." /> | |
171 | |
172 </inputs> | |
173 <outputs> | |
174 <data name="output_quant" format="tabular" from_work_dir="quant.sf" label="${tool.name} on ${on_string} (Quantification)" /> | |
175 <data name="output_bias_corrected_quant" format="tabular" from_work_dir="quant_bias_corrected.sf" label="${tool.name} on ${on_string} (Bias corrected Quantification)"> | |
176 <filter>biasCorrect is True</filter> <!-- the boolean input is declared as biasCorrect; no input named bias_correct exists --> | |
177 </data> | |
178 <data name="output_gene_quant" format="tabular" from_work_dir="quant.genes.sf" label="${tool.name} on ${on_string} (Gene Quantification)"> | |
179 <filter>geneMap</filter> <!-- geneMap is an optional dataset input, so test for presence; it is never the literal True --> | |
180 </data> | |
181 </outputs> | |
182 <tests> | |
183 <test> | |
184 <param name="single_or_paired_opts" value="paired" /> | |
185 <param name="input_mate1" value="reads_1.fastq" /> | |
186 <param name="input_mate2" value="reads_2.fastq" /> | |
187 <param name="biasCorrect" value="True" /> | |
188 <param name="TranscriptSource" value="history" /> | |
189 <param name="ownFile" value="transcripts.fasta" ftype="fasta" /> | |
190 <output file="sailfish_quant_result1.tab" ftype="tabular" name="output_quant" /> | |
191 <output file="sailfish_bias_result1.tab" ftype="tabular" name="output_bias_corrected_quant" /> | |
192 </test> | |
193 </tests> | |
194 <help> | |
195 <![CDATA[ | |
196 **What it does** | |
197 | |
198 Sailfish is a tool for transcript quantification from RNA-seq data. It | |
199 requires a set of target transcripts (either from a reference or *de-novo* | |
200 assembly) to quantify. All you need to run Sailfish is a fasta file containing | |
201 your reference transcripts and a (set of) fasta/fastq file(s) containing your | |
202 reads. Sailfish runs in two phases; indexing and quantification. The indexing | |
203 step is independent of the reads, and only needs to be run once for a particular | |
204 set of reference transcripts and choice of k (the k-mer size). The | |
205 quantification step, obviously, is specific to the set of RNA-seq reads and is | |
206 thus run more frequently. | |
207 | |
208 The quantification output contains a number of columns: | |
209 (1) Transcript ID, | |
210 (2) Transcript Length, | |
211 (3) Transcripts per Million (TPM) and | |
212 (4) Estimated number of reads (an estimate of the number of reads drawn from this transcript given the transcript’s relative abundance and length). | |
213 | |
214 The first two columns are self-explanatory, the third is a measure of transcript abundance and the final is a commonly used input for differential expression tools. | |
215 The Transcripts per Million quantification number is computed as described in [1], and is meant as an estimate of the number of transcripts, per million observed transcripts, | |
216 originating from each isoform. Its benefit over the F/RPKM measure is that it is independent of the mean expressed transcript length | |
217 (i.e. if the mean expressed transcript length varies between samples, for example, this alone can affect differential analysis based on the F/RPKM). | |
218 | |
219 | |
220 | |
221 Fragment Library Types | |
222 ====================== | |
223 | |
224 There are numerous library preparation protocols for RNA-seq that result in | |
225 sequencing reads with different characteristics. For example, reads can be | |
226 single end (only one side of a fragment is recorded as a read) or paired-end | |
227 (reads are generated from both ends of a fragment). Further, the sequencing | |
228 reads themselves may be unstranded or strand-specific. Finally, paired-end | |
229 protocols will have a specified relative orientation. To characterize the | |
230 various different types of sequencing libraries, we've created a miniature | |
231 "language" that allows for the succinct description of the many different types | |
232 of possible fragment libraries. For paired-end reads, the possible | |
233 orientations, along with a graphical description of what they mean, are | |
234 illustrated below: | |
235 | |
236 .. image:: ReadLibraryIllustration.png | |
237 | |
238 The library type string consists of three parts: the relative orientation of | |
239 the reads, the strandedness of the library, and the directionality of the | |
240 reads. | |
241 | |
242 The first part of the library string (relative orientation) is only provided if | |
243 the library is paired-end. The possible options are: | |
244 | |
245 :: | |
246 | |
247 I = inward | |
248 O = outward | |
249 M = matching | |
250 | |
251 The second part of the read library string specifies whether the protocol is | |
252 stranded or unstranded; the options are: | |
253 | |
254 :: | |
255 | |
256 S = stranded | |
257 U = unstranded | |
258 | |
259 If the protocol is unstranded, then we're done. The final part of the library | |
260 string specifies the strand from which the read originates in a strand-specific | |
261 protocol — it is only provided if the library is stranded (i.e. if the | |
262 library format string is of the form S). The possible values are: | |
263 | |
264 :: | |
265 | |
266 F = read 1 (or single-end read) comes from the forward strand | |
267 R = read 1 (or single-end read) comes from the reverse strand | |
268 | |
269 So, for example, if you wanted to specify a fragment library of strand-specific | |
270 paired-end reads, oriented toward each other, where read 1 comes from the | |
271 forward strand and read 2 comes from the reverse strand, you would specify ``-l | |
272 ISF`` on the command line. This designates that the library being processed has | |
273 the type "ISF" meaning, **I**\ nward (the relative orientation), **S**\ tranded | |
274 (the protocol is strand-specific), **F**\ orward (read 1 comes from the forward | |
275 strand). | |
276 | |
277 The single-end library strings are a bit simpler than their paired-end | |
278 counterparts, since there is no relative orientation of which to speak. Thus, the | |
279 only possible library format types for single-end reads are ``U`` (for | |
280 unstranded), ``SF`` (for strand-specific reads coming from the forward strand) | |
281 and ``SR`` (for strand-specific reads coming from the reverse strand). | |
282 | |
283 A few more examples of some library format strings and their interpretations are: | |
284 | |
285 :: | |
286 | |
287 IU (an unstranded paired-end library where the reads face each other) | |
288 | |
289 :: | |
290 | |
291 SF (a stranded single-end protocol where the reads come from the forward strand) | |
292 | |
293 :: | |
294 | |
295 OSR (a stranded paired-end protocol where the reads face away from each other, | |
296 read1 comes from reverse strand and read2 comes from the forward strand) | |
297 | |
298 .. note:: Correspondence to TopHat library types | |
299 | |
300 The popular `TopHat <http://ccb.jhu.edu/software/tophat/index.shtml>`_ RNA-seq | |
301 read aligner has a different convention for specifying the format of the library. | |
302 Below is a table that provides the corresponding sailfish/salmon library format | |
303 string for each of the potential TopHat library types: | |
304 | |
305 | |
306 +---------------------+-------------------------+ | |
307 | TopHat | Salmon (and Sailfish) | | |
308 +=====================+============+============+ | |
309 | | Paired-end | Single-end | | |
310 +---------------------+------------+------------+ | |
311 |``-fr-unstranded`` |``-l IU`` |``-l U`` | | |
312 +---------------------+------------+------------+ | |
313 |``-fr-firststrand`` |``-l ISR`` |``-l SR`` | | |
314 +---------------------+------------+------------+ | |
315 |``-fr-secondstrand`` |``-l ISF`` |``-l SF`` | | |
316 +---------------------+------------+------------+ | |
317 | |
318 The remaining salmon library format strings are not directly expressible in terms | |
319 of the TopHat library types, and so there is no direct mapping for them. | |
320 | |
321 | |
322 ]]> | |
323 </help> | |
324 </tool> |