2
|
1 <tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
|
|
2 <description>
|
|
3 Analyse hairpin differential representation using edgeR
|
|
4 </description>
|
|
5
|
|
6 <requirements>
|
|
7 <requirement type="R-module">edgeR</requirement>
|
|
8 <requirement type="R-module">limma</requirement>
|
|
9 </requirements>
|
|
10
|
|
11 <stdio>
|
|
12 <exit_code range="1:" level="fatal" description="Tool exception" />
|
|
13 </stdio>
|
|
14
|
|
15 <command interpreter="Rscript">
|
|
16 hairpinTool.R $inputOpt.type
|
|
17 #if $inputOpt.type=="fastq":
|
|
18 #for $i, $fas in enumerate($inputOpt.fastq):
|
|
19 fastq::$fas.file
|
|
20 #end for
|
|
21
|
|
22 $inputOpt.hairpin
|
|
23 $inputOpt.samples
|
|
24
|
|
25 #if $inputOpt.positions.option=="yes":
|
|
26 $inputOpt.positions.barstart
|
|
27 $inputOpt.positions.barend
|
|
28 $inputOpt.positions.hpstart
|
|
29 $inputOpt.positions.hpend
|
|
30 #else:
|
|
31 1
|
|
32 5
|
|
33 37
|
|
34 57
|
|
35 #end if
|
|
36 #else:
|
|
37 $inputOpt.counts
|
|
38 $inputOpt.anno
|
|
39 "$inputOpt.factors"
|
|
40 0 0 0
|
|
41 #end if
|
|
42
|
|
43 #if $filterCPM.option=="yes":
|
|
44 $filterCPM.cpmReq
|
|
45 $filterCPM.sampleReq
|
|
46 #else:
|
|
47 -Inf
|
|
48 -Inf
|
|
49 #end if
|
|
50
|
|
51 $fdr
|
|
52 $lfc
|
|
53 $workMode.mode
|
|
54 $outFile
|
|
55 $outFile.files_path
|
|
56
|
|
57 #if $workMode.mode=="classic":
|
|
58 "$workMode.pair1"
|
|
59 "$workMode.pair2"
|
|
60 #else:
|
|
61 "$workMode.contrast"
|
|
62 $workMode.roast.option
|
|
63 #if $workMode.roast.option=="yes":
|
|
64 $workMode.roast.hairpinReq
|
|
65 $workMode.roast.select.option
|
|
66 "$workMode.roast.select.selection"
|
|
67 #else:
|
|
68 0
|
|
69 0
|
|
70 0
|
|
71 #end if
|
|
72 #end if
|
|
73 </command>
|
|
74
|
|
75 <inputs>
|
|
76 <conditional name="inputOpt">
|
|
77 <param name="type" type="select" label="Input File Type">
|
|
78 <option value="fastq">FastQ File</option>
|
|
79 <option value="counts">Table of Counts</option>
|
|
80 </param>
|
|
81
|
|
82 <when value="fastq">
|
|
83 <param name="hairpin" type="data" format="tabular"
|
|
84 label="Hairpin Annotation"/>
|
|
85
|
|
86
|
|
87 <param name="samples" type="data" format="tabular"
|
|
88 label="Sample Annotation"/>
|
|
89
|
|
90 <repeat name="fastq" title="FastQ Files">
|
|
91 <param name="file" type="data" format="fastq"/>
|
|
92 </repeat>
|
|
93
|
|
94 <conditional name="positions">
|
|
95 <param name="option" type="select"
|
|
96 label="Specify Barcode and Hairpin Locations?"
|
|
97 help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
|
|
98 <option value="no" selected="True">No</option>
|
|
99 <option value="yes">Yes</option>
|
|
100 </param>
|
|
101
|
|
102 <when value="yes">
|
|
103 <param name="barstart" type="integer" value="1"
|
|
104 label="Barcode Starting Position"/>
|
|
105 <param name="barend" type="integer" value="5"
|
|
106 label="Barcode Ending Position"/>
|
|
107
|
|
108 <param name="hpstart" type="integer" value="37"
|
|
109 label="Hairpin Starting Position"/>
|
|
110
|
|
111 <param name="hpend" type="integer" value="57"
|
|
112 label="Hairpin Ending Position"/>
|
|
113 </when>
|
|
114
|
|
115 <when value="no"/>
|
|
116 </conditional>
|
|
117 </when>
|
|
118
|
|
119 <when value="counts">
|
|
120 <param name="counts" type="data" format="tabular" label="Counts Table"/>
|
|
121 <param name="anno" type="data" format="tabular"
|
|
122 label="Hairpin Annotation"/>
|
|
123 <param name="factors" type="data" format="tabular"
|
|
124 label="Sample Annotation"/>
|
|
125 </when>
|
|
126 </conditional>
|
|
127
|
|
128 <conditional name="filterCPM">
|
|
129 <param name="option" type="select" label="Filter Low CPM?"
|
|
130 help="Ignore hairpins with very low representation when performing
|
|
131 analysis.">
|
|
132 <option value="yes">Yes</option>
|
|
133 <option value="no">No</option>
|
|
134 </param>
|
|
135
|
|
136 <when value="yes">
|
|
137 <param name="cpmReq" type="float" value="0.5" min="0" max="1"
|
|
138 label="Minimum CPM"/>
|
|
139
|
|
140 <param name="sampleReq" type="integer" value="1" min="0"
|
|
141 label="Minimum Samples"
|
|
142 help="Filter out all the genes that do not meet the minimum
|
|
143 CPM in at least this many samples."/>
|
|
144 </when>
|
|
145
|
|
146 <when value="no"/>
|
|
147
|
|
148 </conditional>
|
|
149
|
|
150 <conditional name="workMode">
|
|
151 <param name="mode" type="select" label="Analysis Type"
|
|
152 help="Classic Exact Tests are useful for simple comparisons across
|
|
153 two sampling groups. Generalised linear models allow for more
|
|
154 complex contrasts and gene level analysis to be made.">
|
|
155 <option value="classic">Classic Exact Test</option>
|
|
156 <option value="glm">Generalised Linear Model</option>
|
|
157 </param>
|
|
158
|
|
159 <when value="classic">
|
|
160 <param name="pair1" type="text" label="Compare" size="40"/>
|
|
161 <param name="pair2" type="text" label="To" size="40"
|
|
162 help="The analysis will subtract values of this group from those
|
|
163 in the group above to establish the difference."/>
|
|
164 </when>
|
|
165
|
|
166 <when value="glm">
|
|
167 <param name="contrast" type="text" size="60"
|
|
168 label="Contrasts of interest"
|
|
169 help="Specify equations defining contrasts to be made. Eg.
|
|
170 KD-Control will result in positive fold change if KD has
|
|
171 greater expression and negative if Control has greater
|
|
172 expression."/>
|
|
173
|
|
174 <conditional name="roast">
|
|
175 <param name="option" type="select"
|
|
176 label="Perform Gene Level Analysis?"
|
|
177 help="Analyse LogFC tendencies for hairpins belonging
|
|
178 to the same gene.">
|
|
179 <option value="no">No</option>
|
|
180 <option value="yes">Yes</option>
|
|
181 </param>
|
|
182
|
|
183 <when value="yes">
|
|
184 <param name="hairpinReq" type="integer" value="2" min="2"
|
|
185 label="Minimum Hairpins"
|
|
186 help="Only genes with at least this many hairpins will
|
|
187 be analysed."/>
|
|
188
|
|
189 <conditional name="select">
|
|
190 <param name="option" type="select"
|
|
191 label="Gene Selection Method">
|
|
192 <option value="rank">By p-value Rank</option>
|
|
193 <option value="geneID">By Gene Identifier</option>
|
|
194 </param>
|
|
195 <when value="rank">
|
|
196 <param name="selection" type="text" size="40" value="1:5"
|
|
197 label="Ranks of Top Genes to Plot"
|
|
198 help="Genes are ranked in ascending p-value for
|
|
199 differential representation, individual ranks can
|
|
200 be entered separated by comma or a range separated
|
|
201 by colon."/>
|
|
202 </when>
|
|
203 <when value="geneID">
|
|
204 <param name="selection" type="text" size="80" value=""
|
|
205 label="Symbols of Genes to Plot"
|
|
206 help="Select genes based on their identifier in the
|
|
207 'Gene' column of the hairpin annotation file.
|
|
208 Please ensure exact match with the values in input
|
|
209 file and separate selections with commas."/>
|
|
210 </when>
|
|
211 </conditional>
|
|
212
|
|
213
|
|
214 </when>
|
|
215
|
|
216 <when value="no"/>
|
|
217 </conditional>
|
|
218 </when>
|
|
219 </conditional>
|
|
220
|
|
221 <param name="fdr" type="float" value="0.05" min="0" max="1"
|
|
222 label="FDR Threshold"
|
|
223 help="All observations below this threshold will be highlighted
|
|
224 in the smear plot."/>
|
|
225 <param name="lfc" type="float" value="0" min="0"
|
|
226 label="Absolute LogFC Threshold"
|
|
227 help="In addition to meeting the FDR requirement, the absolute
|
|
228 value of the log-fold-change of the observation must be above
|
|
229 this threshold to be highlighted."/>
|
|
230 </inputs>
|
|
231
|
|
232 <outputs>
|
|
233 <data format="html" name="outFile" label="shRNAseq Analysis"/>
|
|
234 </outputs>
|
|
235
|
|
236 <help>
|
|
237 .. class:: infomark
|
|
238
|
|
239 **What it does**
|
|
240
|
|
241 Given tables containing information about the hairpins and their associated
|
|
242 barcodes, information about the samples and fastq file containing the hairpin
|
|
243 reads, this tool will generate plots and tables for the analysis of differential
|
|
244 representation.
|
|
245
|
|
246 -----
|
|
247
|
|
248 .. class:: infomark
|
|
249
|
|
250 **INPUTS**
|
|
251
|
|
252 **Input File Type:**
|
|
253
|
|
254 This tool is able to either generate counts from a raw FastQ file given the
|
|
255 information regarding the samples and hairpins. Alternatively if a table of
|
|
256 counts has already been generated it can also be used.
|
|
257
|
|
258 **Counts Table (Counts Input):**
|
|
259
|
|
260 A tab delimited text table of information regarding the counts of hairpins.
|
|
261 Should have a column 'ID' to denote the hairpins that counts correspond to. Each
|
|
262 additional column should have titles corresponding to the label for the sample.
|
|
263
|
|
264 Example::
|
|
265
|
|
266 ID Sample1 Sample2 Sample3
|
|
267 Control1 49802 48014 40148
|
|
268 Control2 12441 16352 14232
|
|
269 Control3 9842 9148 9111
|
|
270 Hairpin1 3300 3418 2914
|
|
271 Hairpin2 91418 95812 93174
|
|
272 Hairpin3 32985 31975 35104
|
|
273 Hairpin4 12082 14081 14981
|
|
274 Hairpin5 2491 2769 2691
|
|
275 Hairpin6 1294 1486 1642
|
|
276 Hairpin7 49501 49076 47611
|
|
277 ...
|
|
278
|
|
279 **Hairpin Annotation:**
|
|
280
|
|
281 A tab delimited text table of information regarding the hairpins. Should have
|
|
282 columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it
|
|
283 with the reads to produce counts and identify which gene the hairpin acts on.
|
|
284
|
|
285 NOTE: the column names are case sensitive and should be input exactly as they
|
|
286 are shown here.
|
|
287
|
|
288 Example::
|
|
289
|
|
290 ID Sequences Gene
|
|
291 Control1 TCTCGCTTGGGCGAGAGTAAG 2
|
|
292 Control2 CCGCCTGAAGTCTCTGATTAA 2
|
|
293 Control3 AGGAATTATAATGCTTATCTA 2
|
|
294 Hairpin1 AAGGCAGAGACTGACCACCTA 4
|
|
295 Hairpin2 GAGCGACCTGGTGTTACTCTA 4
|
|
296 Hairpin3 ATGGTGTAAATAGAGCTGTTA 4
|
|
297 Hairpin4 CAGCTCATCTTCTGTGAAGAA 4
|
|
298 Hairpin5 CAGCTCTGTGGGTCAGAAGAA 4
|
|
299 Hairpin6 CCAGGCACAGATCTCAAGATA 4
|
|
300 Hairpin7 ATGACAAGAAAGACATCTCAA 7
|
|
301 ...
|
|
302
|
|
303 **Sample Annotation (FastQ Input):**
|
|
304
|
|
305 A tab delimited text table of information regarding the samples. Should have
|
|
306 columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify
|
|
307 the sample in the reads by its barcode sequence and correctly group replicates
|
|
308 for analysis. Additional columns may be inserted for annotation purposes and will
|
|
309 not interfere with analysis as long as the necessary columns are present.
|
|
310
|
|
311 NOTE: the column names are case sensitive and should be input exactly as they
|
|
312 are shown here.
|
|
313
|
|
314 Example::
|
|
315
|
|
316 ID Sequences group Replicate
|
|
317 3 GAAAG Day 2 1
|
|
318 6 GAACC Day 10 1
|
|
319 9 GAAGA Day 5 GFP neg 1
|
|
320 16 GAATT Day 5 GFP pos 1
|
|
321 18 GACAC Day 2 2
|
|
322 21 GACCA Day 10 2
|
|
323 28 GACGT Day 5 GFP neg 2
|
|
324 31 GACTG Day 5 GFP pos 2
|
|
325 33 GAGAA Day 2 3
|
|
326 40 GAGCT Day 10 3
|
|
327 ...
|
|
328
|
|
329 **Specify Barcode and Hairpin Locations (FastQ Input):**
|
|
330
|
|
331 It is assumed that in the sequencing reads that the first 5 bases are the
|
|
332 barcodes and that bases 37-57 are the hairpins. If this is not the case then the
|
|
333 values of the positions can be changed, however it still requires the barcodes
|
|
334 and hairpins to be in a consistent location and in a continuous sequence.
|
|
335
|
|
336 **Filter Low CPM?:**
|
|
337
|
|
338 Often in a large screen there may be members with very low counts which are of no
|
|
339 interest in the experiment; these may be filtered out to speed up computations.
|
|
340 Filtering will be based on counts per million in a required number of samples.
|
|
341
|
|
342 **Analysis Type:**
|
|
343
|
|
344 * **Classic Exact Test:** This allows two experimental groups to be compared and
|
|
345 p-values for differential representation derived for each hairpin. Simple and
|
|
346 fast for straightforward comparisons. In this option you will have the option of
|
|
347 "*Compare* x *To* y" which implicitly subtracts the data from y from that of x
|
|
348 to produce the comparison.
|
|
349
|
|
350 * **Generalised Linear Model:** This allows for complex contrasts to be specified
|
|
351 and also gene level analysis to be performed. If this option is chosen then
|
|
352 contrasts must be explicitly stated in equations and multiple contrasts can be
|
|
353 made. In addition there will be the option to analyse hairpins on a per-gene
|
|
354 basis to see if hairpins belonging to a particular gene have any overall
|
|
355 tendencies for the direction of their log-fold-change.
|
|
356
|
|
357 **FDR Threshold:**
|
|
358 The smear plot in the output will have hairpins highlighted to signify
|
|
359 significant differential representation. The significance is determined by
|
|
360 controlling the false discovery rate, only those with an FDR lower than the
|
|
361 threshold will be highlighted in the plot.
|
|
362
|
|
363 -----
|
|
364
|
|
365 **Citations:**
|
|
366
|
|
367 .. class:: infomark
|
|
368
|
|
369 limma
|
|
370
|
|
371 Please cite the paper below for the limma software itself. Please also try
|
|
372 to cite the appropriate methodology articles that describe the statistical
|
|
373 methods implemented in limma, depending on which limma functions you are
|
|
374 using. The methodology articles are listed in Section 2.1 of the limma
|
|
375 User's Guide.
|
|
376
|
|
377 * Smyth, GK (2005). Limma: linear models for microarray data. In:
|
|
378 'Bioinformatics and Computational Biology Solutions using R and
|
|
379 Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry,
|
|
380 W. Huber (eds), Springer, New York, pages 397-420.
|
|
381
|
|
382 .. class:: infomark
|
|
383
|
|
384 edgeR
|
|
385
|
|
386 Please cite the first paper for the software itself and the other papers for
|
|
387 the various original statistical methods implemented in edgeR. See
|
|
388 Section 1.2 in the User's Guide for more detail.
|
|
389
|
|
390 * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor
|
|
391 package for differential expression analysis of digital gene expression
|
|
392 data. Bioinformatics 26, 139-140
|
|
393
|
|
394 * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing
|
|
395 differences in tag abundance. Bioinformatics 23, 2881-2887
|
|
396
|
|
397 * Robinson MD and Smyth GK (2008). Small-sample estimation of negative
|
|
398 binomial dispersion, with applications to SAGE data.
|
|
399 Biostatistics, 9, 321-332
|
|
400
|
|
401 * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis
|
|
402 of multifactor RNA-Seq experiments with respect to biological variation.
|
|
403 Nucleic Acids Research 40, 4288-4297
|
|
404
|
|
405 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
|
406 .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
|
|
407 </help>
|
|
408 </tool>
|
|
409
|