comparison edger.xml @ 0:9bdff28ae1b1 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/edger commit eac022c9c6e51e661c1513306b9fefdad673487d
author iuc
date Tue, 07 Nov 2017 08:18:14 -0500
parents
children 2a16413ec60d
comparison
equal deleted inserted replaced
-1:000000000000 0:9bdff28ae1b1
1 <tool id="edger" name="edgeR" version="3.16.5">
2 <description>
3 Perform differential expression of count data
4 </description>
5
6 <requirements>
7 <requirement type="package" version="3.16.5">bioconductor-edger</requirement>
8 <requirement type="package" version="0.2.15">r-rjson</requirement>
9 <requirement type="package" version="1.20.0">r-getopt</requirement>
10 <!-- required for alpha function used with plotMD -->
11 <requirement type="package" version="0.4.1">r-scales</requirement>
12 <!-- This should be in limma conda now - check why still needed? -->
13 <requirement type="package" version="1.4.30">r-statmod</requirement>
14 </requirements>
15
16 <version_command><![CDATA[
17 echo $(R --version | grep version | grep -v GNU)", edgeR version" $(R --vanilla --slave -e "library(edgeR); cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", scales version" $(R --vanilla --slave -e "library(scales); cat(sessionInfo()\$otherPkgs\$scales\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", rjson version" $(R --vanilla --slave -e "library(rjson); cat(sessionInfo()\$otherPkgs\$rjson\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", getopt version" $(R --vanilla --slave -e "library(getopt); cat(sessionInfo()\$otherPkgs\$getopt\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
18 ]]></version_command>
19
20 <command detect_errors="exit_code"><![CDATA[
21 #import json
22 Rscript '$__tool_directory__/edger.R'
23
24 -R '$outReport'
25 -o '$outReport.files_path'
26
27 #if $input.format=="files":
28
29 ## Adapted from DESeq2 wrapper
30 #set $temp_factor_names = list()
31 #for $fact in $input.rep_factor:
32 #set $temp_factor = list()
33 #for $g in $fact.rep_group:
34 #set $count_files = list()
35 #for $file in $g.countsFile:
36 $count_files.append(str($file))
37 #end for
38 $temp_factor.append( {str($g.groupName): $count_files} )
39 #end for
40
41 $temp_factor.reverse()
42 $temp_factor_names.append([str($fact.factorName), $temp_factor])
43 #end for
44 -j '#echo json.dumps(temp_factor_names)#'
45
46 #elif $input.format=="matrix":
47 -m '$input.counts'
48 #if $input.fact.ffile=='yes':
49 -f '$input.fact.finfo'
50 #else:
51 -i '${ '|'.join( ['%s::%s' % ($x.factorName, $x.groupNames) for x in $input.fact.rep_factor] ) }'
52 #end if
53 #end if
54
55 #if $anno.annoOpt=='yes':
56 -a '$anno.geneanno'
57 #end if
58
59 -C '${ ','.join( ['%s' % $x.contrast for x in $rep_contrast] ) }'
60
61 #if $f.filt.filt_select == 'yes':
62 #if $f.filt.cformat.format_select == 'cpm':
63 -c '$f.filt.cformat.cpmReq'
64 -s '$f.filt.cformat.cpmSampleReq'
65 #elif $f.filt.cformat.format_select == 'counts':
66 -z '$f.filt.cformat.cntReq'
67 #if $f.filt.cformat.samples.count_select == 'total':
68 -y
69 #elif $f.filt.cformat.samples.count_select == 'sample':
70 -s '$f.filt.cformat.samples.cntSampleReq'
71 #end if
72 #end if
73 #end if
74
75 #if $out.normCounts:
76 -x
77 #end if
78 #if $out.rdaOption:
79 -r
80 #end if
81
82 -l '$adv.lfc'
83 -p '$adv.pVal'
84 -d '$adv.pAdjust'
85 -n '$adv.normalisationOption'
86 #if $adv.robOption:
87 -b
88 #end if
89 #if $adv.lrtOption:
90 -t
91 #end if
92
93 &&
94 mkdir ./output_dir
95
96 &&
97 cp '$outReport.files_path'/*.tsv output_dir/
98 ]]></command>
99
100 <inputs>
101
102 <!-- Counts and Factors -->
103 <conditional name="input">
104 <param name="format" type="select" label="Count Files or Matrix?"
105 help="You can choose to input either separate count files (one per sample) or a single count matrix">
106 <option value="files">Separate Count Files</option>
107 <option value="matrix">Single Count Matrix</option>
108 </param>
109
110 <when value="files">
111 <repeat name="rep_factor" title="Factor" min="1">
112 <param name="factorName" type="text" label="Name" help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Optional additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section for more information. NOTE: Please only use letters, numbers or underscores.">
113 <sanitizer>
114 <valid initial="string.letters,string.digits"><add value="_" /></valid>
115 </sanitizer>
116 </param>
117 <repeat name="rep_group" title="Group" min="2" default="2">
118 <param name="groupName" type="text" label="Name"
119 help="Name of group that the counts file(s) belong to (e.g. WT or Mut). NOTE: Please only use letters, numbers or underscores (case sensitive).">
120 <sanitizer>
121 <valid initial="string.letters,string.digits"><add value="_" /></valid>
122 </sanitizer>
123 </param>
124 <param name="countsFile" type="data" format="tabular" multiple="true" label="Counts file(s)"/>
125 </repeat>
126 </repeat>
127 </when>
128
129 <when value="matrix">
130 <param name="counts" type="data" format="tabular" label="Count Matrix"/>
131
132 <conditional name="fact">
133 <param name="ffile" type="select" label="Input factor information from file?"
134 help="You can choose to input the factor and group information for the samples from a file or manually enter below.">
135 <option value="no">No</option>
136 <option value="yes">Yes</option>
137 </param>
138 <when value="yes">
139 <param name="finfo" type="data" format="tabular" label="Factor File"/>
140 </when>
141 <when value="no" >
142 <repeat name="rep_factor" title="Factor" min="1">
143 <param name="factorName" type="text" label="Factor Name"
144 help="Name of experiment factor of interest (e.g. Genotype). One factor must be entered and there must be two or more groups per factor. Additional factors (e.g. Batch) can be entered using the Insert Factor button below, see Help section below. NOTE: Please only use letters, numbers or underscores.">
145 <validator type="empty_field" />
146 <validator type="regex" message="Please only use letters, numbers or underscores">^[\w]+$</validator>
147 </param>
148 <param name="groupNames" type="text" label="Groups"
149 help="Enter the group names for the samples separated with commas e.g. WT,WT,WT,Mut,Mut,Mut. The order of the names must match the order of the samples in the columns of the count matrix. NOTE: Please only use letters, numbers or underscores (case sensitive).">
150 <validator type="empty_field" />
151 <validator type="regex" message="Please only use letters, numbers or underscores, and separate levels by commas">^[\w,]+$</validator>
152 </param>
153 </repeat>
154 </when>
155 </conditional>
156 </when>
157 </conditional>
158
159 <!-- Gene Annotations -->
160 <conditional name="anno">
161 <param name="annoOpt" type="select" label="Use Gene Annotations?"
162 help="If you provide an annotation file, annotations will be added to the table(s) of differential expression results to provide descriptions for each gene. See Help section below.">
163 <option value="no">No</option>
164 <option value="yes">Yes</option>
165 </param>
166 <when value="yes">
167 <param name="geneanno" type="data" format="tabular" label="Gene Annotations"/>
168 </when>
169 <when value="no" />
170 </conditional>
171
172 <!-- Contrasts -->
173 <repeat name="rep_contrast" title="Contrast" min="1" default="1">
174 <param name="contrast" type="text" label="Contrast of Interest" help="Names of two groups to compare separated by a hyphen e.g. Mut-WT. If the order is Mut-WT the fold changes in the results will be up/down in Mut relative to WT. If you have more than one contrast enter each separately using the Insert Contrast button below. For more info, see Chapter 8 in the limma User's guide: https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf">
175 <validator type="empty_field" />
176 <validator type="regex" message="Please only use letters, numbers or underscores">^[\w-]+$</validator>
177 </param>
178 </repeat>
179
180 <!-- Filter Options -->
181 <section name="f" expanded="false" title="Filter Low Counts">
182 <conditional name="filt">
183 <param name="filt_select" type="select" label="Filter lowly expressed genes?" help="Treat genes with very low expression as unexpressed and filter out. See the Filter Low Counts section below for more information. Default: No">
184 <option value="no" selected="true">No</option>
185 <option value="yes">Yes</option>
186 </param>
187 <when value="yes">
188 <conditional name="cformat">
189 <param name="format_select" type="select" label="Filter on CPM or Count values?" help="It is slightly better to base the filtering on count-per-million (CPM) rather than the raw count values so as to avoid favoring genes expressed in samples sequenced to a higher depth. ">
190 <option value="cpm">CPM</option>
191 <option value="counts">Counts</option>
192 </param>
193 <when value="cpm">
194 <param name="cpmReq" type="float" value="1" min="0" label="Minimum CPM" help="Treat genes with CPM below this value as unexpressed and filter out. See the Filter Low Counts section below for more information."/>
195 <param name="cpmSampleReq" type="integer" value="0" min="0" label="Minimum Samples"
196 help="Filter out all genes that do not meet the Minimum CPM in at least this many samples. See the Filter Low Counts section below for more information."/>
197 </when>
198 <when value="counts">
199 <param name="cntReq" type="integer" value="0" min="0" label="Minimum Count" help="Filter out all genes that do not meet this minimum count. You can choose below to apply this filter to the total count for all samples or specify the number of samples under Minimum Samples. See the Filter Low Counts section below for more information." />
200 <conditional name="samples">
201 <param name="count_select" type="select" label="Filter on Total Count or per Sample Count values?" >
202 <option value="total">Total</option>
203 <option value="sample">Sample</option>
204 </param>
205 <when value="total">
206 <param name="totReq" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Filter on Total Count" help="Apply the Minimum Count filter to genes after summing counts for all samples. See the Filter Low Counts section below for more information." />
207 </when>
208 <when value="sample">
209 <param name="cntSampleReq" type="integer" value="0" min="0" label="Minimum Samples"
210 help="Filter out all genes that do not meet the Minimum Count in at least this many samples. See the Filter Low Counts section below for more information."/>
211 </when>
212 </conditional>
213 </when>
214 </conditional>
215 </when>
216 <when value="no" />
217 </conditional>
218 </section>
219
220 <!-- Output Options -->
221 <section name="out" expanded="false" title="Output Options">
222 <param name="normCounts" type="boolean" truevalue="1" falsevalue="0" checked="false"
223 label="Output Normalised Counts Table?"
224 help="Output a file containing the normalised counts, these are in log2 counts per million (logCPM). Default: No">
225 </param>
226 <param name="rdaOption" type="boolean" truevalue="1" falsevalue="0" checked="false"
227 label="Output RData file?"
228 help="Output all the data used by R to construct the plots and tables, can be loaded into R. A link to the RData file will be provided in the HTML report. Default: No">
229 </param>
230 </section>
231
232 <!-- Advanced Options -->
233 <section name="adv" expanded="false" title="Advanced Options">
234 <param name="lfc" type="float" value="0" min="0"
235 label="Minimum Log2 Fold Change"
236 help="Genes above this threshold and below the p-value threshold are considered significant and highlighted in the MD plot. Default: 0."/>
237 <param name="pVal" type="float" value="0.05" min="0" max="1"
238 label="P-Value Adjusted Threshold"
239 help="Genes below this threshold are considered significant and highlighted in the MD plot. If either BH(1995) or BY(2001) are selected then this value is a false-discovery-rate control. If Holm(1979) is selected then this is an adjusted p-value for family-wise error rate. Default: 0.05."/>
240 <param name="pAdjust" type="select" label="P-Value Adjustment Method" help="Default: BH">
241 <option value="BH" selected="true">Benjamini and Hochberg (1995)</option>
242 <option value="BY">Benjamini and Yekutieli (2001)</option>
243 <option value="holm">Holm (1979)</option>
244 <option value="none">None</option>
245 </param>
246 <param name="normalisationOption" type="select" label="Normalisation Method" help="Default: TMM">
247 <option value="TMM" selected="true">TMM</option>
248 <option value="RLE">RLE</option>
249 <option value="upperquartile">Upperquartile</option>
250 <option value="none">None (Don't normalise)</option>
251 </param>
252 <param name="robOption" type="boolean" truevalue="1" falsevalue="0" checked="true" label="Use Robust Settings?" help="Using robust settings is usually recommended to protect against outlier genes. Default: Yes" />
253 <param name="lrtOption" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Use Likelihood Ratio Test?" help="Use likelihood ratio test instead of quasi-likelihood F-test. Default: No"/>
254 </section>
255
256 </inputs>
257
258 <outputs>
259 <data name="outReport" format="html" label="${tool.name} on ${on_string}: Report" />
260 <collection name="outTables" type="list" label="${tool.name} on ${on_string}: Tables">
261 <discover_datasets pattern="(?P&lt;name&gt;.+)\.tsv$" format="tabular" directory="output_dir" visible="false" />
262 </collection>
263 </outputs>
264
265 <tests>
266 <!-- Ensure report is output -->
267 <test>
268 <param name="format" value="matrix" />
269 <param name="counts" value="matrix.txt" />
270 <repeat name="rep_factor">
271 <param name="factorName" value="Genotype"/>
272 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
273 </repeat>
274 <repeat name="rep_contrast">
275 <param name="contrast" value="Mut-WT" />
276 </repeat>
277 <repeat name="rep_contrast">
278 <param name="contrast" value="WT-Mut" />
279 </repeat>
280 <param name="normalisationOption" value="TMM" />
281 <output_collection name="outTables" count="2">
282 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT.tsv" />
283 <element name="edgeR_WT-Mut" ftype="tabular" file="edgeR_WT-Mut.tsv" />
284 </output_collection>
285 <output name="outReport" >
286 <assert_contents>
287 <has_text text="edgeR Analysis Output" />
288 <has_text text="quasi-likelihood" />
289 <not_has_text text="likelihood ratio" />
290 <not_has_text text="RData" />
291 </assert_contents>
292 </output>
293 </test>
294 <!-- Ensure annotation file input works -->
295 <test>
296 <param name="format" value="matrix" />
297 <param name="annoOpt" value="yes" />
298 <param name="geneanno" value="anno.txt" />
299 <param name="counts" value="matrix.txt" />
300 <repeat name="rep_factor">
301 <param name="factorName" value="Genotype"/>
302 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
303 </repeat>
304 <repeat name="rep_contrast">
305 <param name="contrast" value="Mut-WT" />
306 </repeat>
307 <param name="normalisationOption" value="TMM" />
308 <output_collection name="outTables" count="1">
309 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_anno.tsv" />
310 </output_collection>
311 </test>
312 <!-- Ensure RData file can be output -->
313 <test>
314 <param name="format" value="matrix" />
315 <param name="rdaOption" value="true" />
316 <param name="counts" value="matrix.txt" />
317 <repeat name="rep_factor">
318 <param name="factorName" value="Genotype"/>
319 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
320 </repeat>
321 <repeat name="rep_contrast">
322 <param name="contrast" value="Mut-WT" />
323 </repeat>
324 <param name="normalisationOption" value="TMM" />
325 <output name="outReport" >
326 <assert_contents>
327 <has_text text="RData" />
328 </assert_contents>
329 </output>
330 </test>
331 <!-- Ensure secondary factors work -->
332 <test>
333 <param name="format" value="matrix" />
334 <param name="counts" value="matrix.txt" />
335 <repeat name="rep_factor">
336 <param name="factorName" value="Genotype"/>
337 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
338 </repeat>
339 <repeat name="rep_factor">
340 <param name="factorName" value="Batch"/>
341 <param name="groupNames" value="b1,b2,b3,b1,b2,b3"/>
342 </repeat>
343 <repeat name="rep_contrast">
344 <param name="contrast" value="Mut-WT" />
345 </repeat>
346 <param name="normalisationOption" value="TMM" />
347 <output_collection name="outTables" count="1" >
348 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact.tsv" />
349 </output_collection>
350 </test>
351 <!-- Ensure factors file input works -->
352 <test>
353 <param name="format" value="matrix" />
354 <param name="ffile" value="yes" />
355 <param name="finfo" value="factorinfo.txt" />
356 <param name="counts" value="matrix.txt" />
357 <repeat name="rep_contrast">
358 <param name="contrast" value="Mut-WT" />
359 </repeat>
360 <param name="normalisationOption" value="TMM" />
361 <output_collection name="outTables" count="1">
362 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact.tsv" />
363 </output_collection>
364 </test>
365 <!-- Ensure normalised counts file output works-->
366 <test>
367 <param name="format" value="matrix" />
368 <param name="normCounts" value="true" />
369 <param name="counts" value="matrix.txt" />
370 <repeat name="rep_factor">
371 <param name="factorName" value="Genotype"/>
372 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
373 </repeat>
374 <repeat name="rep_contrast">
375 <param name="contrast" value="Mut-WT" />
376 </repeat>
377 <param name="normalisationOption" value="TMM" />
378 <output_collection name="outTables" count="2">
379 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT.tsv" />
380 <element name="edgeR_normcounts" ftype="tabular" file="edgeR_normcounts.tsv" />
381 </output_collection>
382 </test>
383 <!-- Ensure likelihood ratio option works -->
384 <test>
385 <param name="format" value="matrix" />
386 <param name="counts" value="matrix.txt" />
387 <repeat name="rep_factor">
388 <param name="factorName" value="Genotype"/>
389 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
390 </repeat>
391 <repeat name="rep_contrast">
392 <param name="contrast" value="Mut-WT" />
393 </repeat>
394 <param name="normalisationOption" value="TMM" />
395 <param name="lrtOption" value="true" />
396 <output name="outReport" >
397 <assert_contents>
398 <has_text text="likelihood ratio" />
399 <not_has_text text="quasi-likelihood" />
400 </assert_contents>
401 </output>
402 </test>
403 <!-- Ensure multiple counts files input works -->
404 <test>
405 <param name="format" value="files" />
406 <repeat name="rep_factor">
407 <param name="factorName" value="Genotype"/>
408 <repeat name="rep_group">
409 <param name="groupName" value="WT"/>
410 <param name="countsFile" value="WT1.counts,WT2.counts,WT3.counts"/>
411 </repeat>
412 <repeat name="rep_group">
413 <param name="groupName" value="Mut"/>
414 <param name="countsFile" value="Mut1.counts,Mut2.counts,Mut3.counts"/>
415 </repeat>
416 </repeat>
417 <repeat name="rep_factor">
418 <param name="factorName" value="Batch"/>
419 <repeat name="rep_group">
420 <param name="groupName" value="b1"/>
421 <param name="countsFile" value="WT1.counts,Mut1.counts"/>
422 </repeat>
423 <repeat name="rep_group">
424 <param name="groupName" value="b2"/>
425 <param name="countsFile" value="WT2.counts,Mut2.counts"/>
426 </repeat>
427 <repeat name="rep_group">
428 <param name="groupName" value="b3"/>
429 <param name="countsFile" value="WT3.counts,Mut3.counts"/>
430 </repeat>
431 </repeat>
432 <param name="annoOpt" value="yes" />
433 <param name="geneanno" value="anno.txt" />
434 <repeat name="rep_contrast">
435 <param name="contrast" value="Mut-WT" />
436 </repeat>
437 <repeat name="rep_contrast">
438 <param name="contrast" value="WT-Mut" />
439 </repeat>
440 <param name="normCounts" value="true" />
441 <output_collection name="outTables" count="3">
442 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_2fact_anno.tsv" />
443 <element name="edgeR_WT-Mut" ftype="tabular" file="edgeR_WT-Mut_2fact_anno.tsv" />
444 <element name="edgeR_normcounts" ftype="tabular" file="edgeR_normcounts_anno.tsv" />
445 </output_collection>
446 </test>
447 <!-- Ensure filtering on CPM in Minimum Samples works -->
448 <test>
449 <param name="format" value="matrix" />
450 <param name="counts" value="matrix.txt" />
451 <repeat name="rep_factor">
452 <param name="factorName" value="Genotype"/>
453 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
454 </repeat>
455 <repeat name="rep_contrast">
456 <param name="contrast" value="Mut-WT" />
457 </repeat>
458 <param name="normalisationOption" value="TMM" />
459 <param name="filt_select" value="yes" />
460 <param name="format_select" value="cpm" />
461 <!-- real cpmReq values would be a lot lower
462 this is just for this tiny test dataset -->
463 <param name="cpmReq" value="1000" />
464 <param name="cpmSampleReq" value="3" />
465 <output name="outReport" >
466 <assert_contents>
467 <has_text text="CPM in at least" />
468 <not_has_text text="after summing counts for all samples" />
469 <not_has_text text="counts in at least" />
470 </assert_contents>
471 </output>
472 <output_collection name="outTables" count="1" >
473 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" />
474 </output_collection>
475 </test>
476 <!-- Ensure filtering on Count in Minimum Samples works -->
477 <test>
478 <param name="format" value="matrix" />
479 <param name="counts" value="matrix.txt" />
480 <repeat name="rep_factor">
481 <param name="factorName" value="Genotype"/>
482 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
483 </repeat>
484 <repeat name="rep_contrast">
485 <param name="contrast" value="Mut-WT" />
486 </repeat>
487 <param name="normalisationOption" value="TMM" />
488 <param name="filt_select" value="yes" />
489 <param name="format_select" value="counts" />
490 <param name="cntReq" value="10" />
491 <param name="count_select" value="sample" />
492 <param name="cntSampleReq" value="3" />
493 <output name="outReport" >
494 <assert_contents>
495 <has_text text="counts in at least" />
496 <not_has_text text="after summing counts for all samples" />
497 <not_has_text text="CPM in at least" />
498 </assert_contents>
499 </output>
500 <output_collection name="outTables" count="1" >
501 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" />
502 </output_collection>
503 </test>
504 <!-- Ensure filtering on Total Count works -->
505 <test>
506 <param name="format" value="matrix" />
507 <param name="counts" value="matrix.txt" />
508 <repeat name="rep_factor">
509 <param name="factorName" value="Genotype"/>
510 <param name="groupNames" value="Mut,Mut,Mut,WT,WT,WT" />
511 </repeat>
512 <repeat name="rep_contrast">
513 <param name="contrast" value="Mut-WT" />
514 </repeat>
515 <param name="normalisationOption" value="TMM" />
516 <param name="filt_select" value="yes" />
517 <param name="format_select" value="counts" />
518 <!-- real cntReq values would be a lot lower
519 this is just for this tiny test dataset -->
520 <param name="cntReq" value="1000" />
521 <param name="count_select" value="total" />
522 <param name="totReq" value="true" />
523 <output name="outReport" >
524 <assert_contents>
525 <has_text text="after summing counts for all samples" />
526 <not_has_text text="counts in at least" />
527 <not_has_text text="CPM in at least" />
528 </assert_contents>
529 </output>
530 <output_collection name="outTables" count="1" >
531 <element name="edgeR_Mut-WT" ftype="tabular" file="edgeR_Mut-WT_filt.tsv" />
532 </output_collection>
533 </test>
534 </tests>
535
536 <help><![CDATA[
537 .. class:: infomark
538
539 **What it does**
540
541 Given a counts matrix, or a set of counts files, for example from **featureCounts**, and optional information about the genes, this tool
542 produces plots and tables useful in the analysis of differential gene expression.
543
544 This tool uses the `edgeR`_ quasi-likelihood pipeline (edgeR-quasi) for differential expression analysis. This statistical methodology uses negative binomial generalized linear models, but with F-tests instead of likelihood ratio tests. This method provides stricter error rate control than other negative binomial based pipelines, including the traditional edgeR pipelines or DESeq2. While the limma pipelines are recommended for large-scale datasets, because of their speed and flexibility, the edgeR-quasi pipeline gives better performance in low-count situations. For the data analyzed in this `edgeR workflow article`_, the edgeR-quasi, limma-voom and limma-trend pipelines are all equally suitable and give similar results.
545
546 .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
547 .. _edgeR workflow article: https://f1000research.com/articles/5-1438
548
549 -----
550
551 **Inputs**
552
553 **Counts Data:**
554
555 The counts data can either be input as separate counts files (one sample per file) or a single count matrix (one sample per column). The rows correspond to genes, and columns correspond to the counts for the samples. Values must be tab separated, with the first row containing the sample/column labels and the first column containing the row/gene labels. Gene identifiers can be of any type but must be unique and not repeated within a counts file.
556
557 Example - **Separate Count Files**:
558
559 ========== =======
560 **GeneID** **WT1**
561 ---------- -------
562 11287 1699
563 11298 1905
564 11302 6
565 11303 2099
566 11304 356
567 11305 2528
568 ========== =======
569
570 Example - **Single Count Matrix**:
571
572 ========== ======= ======= ======= ======== ======== ========
573 **GeneID** **WT1** **WT2** **WT3** **Mut1** **Mut2** **Mut3**
574 ---------- ------- ------- ------- -------- -------- --------
575 11287 1699 1528 1601 1463 1441 1495
576 11298 1905 1744 1834 1345 1291 1346
577 11302 6 8 7 5 6 5
578 11303 2099 1974 2100 1574 1519 1654
579 11304 356 312 337 361 397 346
580 11305 2528 2438 2493 1762 1942 2027
581 ========== ======= ======= ======= ======== ======== ========
582
583 **Factor Information:**
584 Enter factor names and groups in the tool form, or provide a tab-separated file that has the samples in the same order as listed in the columns of the counts matrix. The second column should contain the primary factor levels (e.g. WT, Mut) with optional additional columns for any secondary factors.
585
586 Example:
587
588 ========== ============ =========
589 **Sample** **Genotype** **Batch**
590 ---------- ------------ ---------
591 WT1 WT b1
592 WT2 WT b2
593 WT3 WT b3
594 Mut1 Mut b1
595 Mut2 Mut b2
596 Mut3 Mut b3
597 ========== ============ =========
598
599 *Factor Name:* The name of the experimental factor being investigated e.g. Genotype, Treatment. One factor must be entered and spaces must not be used. Optionally, additional factors can be included, these are variables that might influence your experiment e.g. Batch, Gender, Subject. If additional factors are entered, edgeR will fit an additive linear model.
600
601 *Groups:* The names of the groups for the factor. These must be entered in the same order as the samples (to which the groups correspond) are listed in the columns of the counts matrix. Spaces must not be used and if entered into the tool form above, the values should be separated by commas.
602
603
604 **Gene Annotations:**
605 Optional input for gene annotations, this can contain more
606 information about the genes than just an ID number. The annotations will
607 be available in the differential expression results table and the optional normalised counts table.
608
609 Example:
610
611 ========== ========== ===================================================
612 **GeneID** **Symbol** **GeneName**
613 ---------- ---------- ---------------------------------------------------
614 1287 Pzp pregnancy zone protein
615 1298 Aanat arylalkylamine N-acetyltransferase
616 1302 Aatk apoptosis-associated tyrosine kinase
617 1303 Abca1 ATP-binding cassette, sub-family A (ABC1), member 1
618 1304 Abca4 ATP-binding cassette, sub-family A (ABC1), member 4
619 1305 Abca2 ATP-binding cassette, sub-family A (ABC1), member 2
620 ========== ========== ===================================================
621
622 **Contrasts of Interest:**
623 The contrasts you wish to make between levels.
624 A common contrast would be a simple difference between two levels: "Mut-WT"
625 represents the difference between the mutant and wild type genotypes.
626 Multiple contrasts must be entered separately using the Insert Contrast button; spaces must not be used.
627
628 **Filter Low Counts:**
629 Genes with very low counts across all libraries provide little evidence for differential expression.
630 From a biological point of view, a gene must be expressed at some minimal level before
631 it is likely to be translated into a protein or to be biologically important. In addition, the
632 pronounced discreteness of these counts interferes with some of the statistical approximations
633 that are used later in the pipeline. These genes should be filtered out prior to further
634 analysis.
635 As a rule of thumb, genes are dropped if they can’t possibly be expressed in all the samples
636 for any of the conditions. Users can set their own definition of genes being expressed. Usually
637 a gene is required to have a count of 5-10 in a library to be considered expressed in that
638 library. Users should also filter with count-per-million (CPM) rather than filtering on the
639 counts directly, as the latter does not account for differences in library sizes between samples.
640
641 This option ignores genes that do not show significant levels of
642 expression. The filtering depends on two criteria: CPM/count and number of samples. You can choose to filter on CPM (Minimum CPM) or count (Minimum Count) values:
643
644 * **Minimum CPM:** This is the minimum count per million that a gene must have in at
645 least the number of samples specified under Minimum Samples.
646
647 * **Minimum Count:** This is the minimum count that a gene must have. It can be combined with either Filter
648 on Total Count or Minimum Samples.
649
650 * **Filter on Total Count:** This can be used with the Minimum Count filter to keep genes
651 with a minimum total read count.
652
653 * **Minimum Samples:** This is the number of samples in which the Minimum CPM/Count
654 requirement must be met in order for that gene to be kept.
655
656 If the Minimum Samples filter is applied, only genes that exhibit a CPM/count greater than the required amount in at least the number of samples specified will be used for analysis. Care should be taken to
657 ensure that the sample requirement is appropriate. In the case of an experiment
658 with two experimental groups each with two members, if there is a change from
659 insignificant CPM/count to significant CPM/count but the sample requirement is set to 3,
660 then this will cause that gene to fail the criteria. When in doubt simply do not
661 filter or consult the `edgeR workflow article`_ for filtering recommendations.
662
663 **Advanced Options:**
664
665 By default, the error rate for multiple testing is controlled using Benjamini and
666 Hochberg's false discovery rate control at a threshold value of 0.05. However,
667 there are options to change this to custom values.
668
669 * **Minimum log2-fold-change Required:**
670 In addition to meeting the requirement for the adjusted statistic for
671 multiple testing, the observation must have an absolute log2-fold-change
672 greater than this threshold to be considered significant, thus highlighted
673 in the MD plot.
674
675 * **Adjusted Threshold:**
676 Set the threshold for the resulting value of the multiple testing control
677 method. Only observations whose statistic falls below this value are
678 considered significant, thus highlighted in the MD plot.
679
680 * **P-Value Adjustment Method:**
681 Change the multiple testing control method. The options are BH(1995) and
682 BY(2001), which are both false discovery rate controls. There is also
683 Holm(1979), which is a method for family-wise error rate control.
684
685 **Normalisation Method:**
686 The most obvious technical factor that affects the read counts, other than gene expression
687 levels, is the sequencing depth of each RNA sample. edgeR adjusts any differential expression
688 analysis for varying sequencing depths as represented by differing library sizes. This is
689 part of the basic modeling procedure and flows automatically into fold-change or p-value
690 calculations. It is always present, and doesn’t require any user intervention.
691 The second most important technical influence on differential expression is one that is less
692 obvious. RNA-seq provides a measure of the relative abundance of each gene in each RNA
693 sample, but does not provide any measure of the total RNA output on a per-cell basis.
694 This commonly becomes important when a small number of genes are very highly expressed
695 in one sample, but not in another. The highly expressed genes can consume a substantial
696 proportion of the total library size, causing the remaining genes to be under-sampled in that
697 sample. Unless this RNA composition effect is adjusted for, the remaining genes may falsely
698 appear to be down-regulated in that sample. The edgeR `calcNormFactors` function normalizes for RNA composition by finding a set of scaling factors for the library sizes that minimize the log-fold changes between the samples for most genes. The default method for computing these scale factors uses a trimmed mean of M values (TMM) between each pair of samples. We call the product of the original library size and the scaling factor the *effective library size*. The effective library size replaces the original library size in all downstream analyses. TMM is the recommended method for most RNA-Seq data where the majority (more than half) of the genes are believed not differentially expressed between any pair of the samples. You can change the normalisation method under **Advanced Options** above. For more information, see the `calcNormFactors` section in the `edgeR User's Guide`_.
699
700 **Robust Settings**
701 Option to use robust settings. Using robust settings (robust=TRUE) with the edgeR estimateDisp and glmQLFit functions is usually recommended to protect against outlier genes. This is turned on by default. Note that it is only used with the quasi-likelihood F test method. For more information, see the `edgeR workflow article`_.
702
703 **Test Method**
704 Option to use the likelihood ratio test instead of the quasi-likelihood F test. For more information, see the `edgeR workflow article`_.
705
706 .. _edgeR User's Guide: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
707
708 -----
709
710 **Outputs**
711
712 This tool outputs
713
714 * a table of differentially expressed genes for each contrast of interest
715 * an HTML report with plots and additional information
716
717 Optionally, under **Output Options** you can choose to output
718
719 * a normalised counts table
720 * an RData file
721
722 -----
723
724 **Citations**
725
726 Please try to cite the appropriate articles when you publish results obtained using software, as such citation is the main means by which the authors receive credit for their work. For the edgeR method itself, please cite Robinson et al., 2010, and for this tool (which was developed from the Galaxy limma-voom tool) please cite Liu et al., 2015.
727
728 ]]></help>
729 <citations>
730 <citation type="doi">10.1093/bioinformatics/btp616</citation>
731 <citation type="doi">10.1093/nar/gkv412</citation>
732 </citations>
733 </tool>