Mercurial > repos > iuc > goseq
comparison goseq.xml @ 2:ab492df30cdf draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/goseq commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a
author | iuc |
---|---|
date | Mon, 23 Oct 2017 11:19:12 -0400 |
parents | 9d1256d9ef0b |
children | 783e8b70b047 |
comparison
equal
deleted
inserted
replaced
1:9d1256d9ef0b | 2:ab492df30cdf |
---|---|
1 <tool id="goseq" name="goseq" version="0.2.2"> | 1 <tool id="goseq" name="goseq" version="1.26.0"> |
2 <description>tests for overrepresented gene categories</description> | 2 <description>tests for overrepresented gene categories</description> |
3 <requirements> | 3 <requirements> |
4 <requirement type="package" version="1.3.2">r-optparse</requirement> | 4 <requirement type="package" version="1.3.2">r-optparse</requirement> |
5 <requirement type="package" version="1.22.0">bioconductor-goseq</requirement> | 5 <requirement type="package" version="1.26.0">bioconductor-goseq</requirement> |
6 <requirement type="package" version="3.3.0">bioconductor-org.hs.eg.db</requirement> | |
7 <requirement type="package" version="3.4.0">bioconductor-org.dm.eg.db</requirement> | |
8 <requirement type="package" version="3.4.1">bioconductor-org.dr.eg.db</requirement> | |
9 <requirement type="package" version="3.4.0">bioconductor-org.mm.eg.db</requirement> | |
6 </requirements> | 10 </requirements> |
7 <stdio> | 11 <stdio> |
8 <regex match="Execution halted" | 12 <regex match="Execution halted" |
9 source="both" | 13 source="both" |
10 level="fatal" | 14 level="fatal" |
16 <regex match="Fatal error" | 20 <regex match="Fatal error" |
17 source="both" | 21 source="both" |
18 level="fatal" | 22 level="fatal" |
19 description="An undefined error occurred, please check your input carefully and contact your administrator." /> | 23 description="An undefined error occurred, please check your input carefully and contact your administrator." /> |
20 </stdio> | 24 </stdio> |
25 <version_command><![CDATA[ | |
26 echo $(R --version | grep version | grep -v GNU)", goseq version" $(R --vanilla --slave -e "library(goseq); cat(sessionInfo()\$otherPkgs\$goseq\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", optparse version" $(R --vanilla --slave -e "library(optparse); cat(sessionInfo()\$otherPkgs\$optparse\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Hs.eg.db version" $(R --vanilla --slave -e "library(org.Hs.eg.db); cat(sessionInfo()\$otherPkgs\$org.Hs.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dr.eg.db version" $(R --vanilla --slave -e "library(org.Dr.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dr.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Dm.eg.db version" $(R --vanilla --slave -e "library(org.Dm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Dm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ")", org.Mm.eg.db version" $(R --vanilla --slave -e "library(org.Mm.eg.db); cat(sessionInfo()\$otherPkgs\$org.Mm.eg.db\$Version)" 2> /dev/null | grep -v -i "WARNING: ") | |
27 ]]></version_command> | |
21 <command><![CDATA[ | 28 <command><![CDATA[ |
22 Rscript '$__tool_directory__'/goseq.r --dge_file '$dge_file' | 29 Rscript '$__tool_directory__/goseq.r' |
23 --length_file '$length_file' | 30 |
24 --category_file '$category_file' | 31 --dge_file '$dge_file' |
25 #if $methods['wallenius']: | 32 --length_file '$length_file' |
26 --wallenius_tab '$wallenius_tab' | 33 |
27 #end if | 34 #if $categorySource.catSource == 'getgo': |
28 #if $methods['hypergeometric']: | 35 --genome $categorySource.genome |
29 --nobias_tab '$nobias_tab' | 36 --gene_id $categorySource.gene_id |
30 #end if | 37 --fetch_cats '$categorySource.fetchcats' |
31 --repcnt '$methods.repcnt' | 38 #elif $categorySource.catSource == 'history': |
32 --sampling_tab '$sampling_tab' | 39 --category_file '$categorySource.category_file' |
33 --p_adj_method '$p_adj_method' | 40 #end if |
34 --use_genes_without_cat '$use_genes_without_cat' | 41 |
35 --make_plots '$make_plots' | 42 #if $methods['wallenius']: |
36 --length_bias_plot '$length_bias_plot' | 43 --wallenius_tab '$wallenius_tab' |
37 --sample_vs_wallenius_plot '$sample_vs_wallenius_plot' | 44 #end if |
45 #if $methods['hypergeometric']: | |
46 --nobias_tab '$nobias_tab' | |
47 #end if | |
48 --repcnt '$methods.repcnt' | |
49 --sampling_tab '$sampling_tab' | |
50 | |
51 --make_plots '$out.make_plots' | |
52 --length_bias_plot '$length_bias_plot' | |
53 --sample_vs_wallenius_plot '$sample_vs_wallenius_plot' | |
54 --rdata '$out.rdata_out' | |
55 | |
56 --p_adj_method '$adv.p_adj_method' | |
57 --use_genes_without_cat '$adv.use_genes_without_cat' | |
58 | |
38 ]]></command> | 59 ]]></command> |
60 | |
61 <!-- Input Files--> | |
39 <inputs> | 62 <inputs> |
40 <param name="dge_file" help="A tabular file with gene names in the first column, and TRUE or FALSE in the last column. TRUE means a gene is differentially expressed. See help section for details." label="Differentially expressed gene file" type="data" format="tabular" /> | 63 <param name="dge_file" type="data" format="tabular" label="Differentially expressed genes file" help="A tabular file with Gene IDs in the first column, and True or False in the second column. True means a gene is differentially expressed. See Help section for details."/> |
41 <param name="length_file" label="Gene length file for length bias correction" help="You can calculate the gene length using the get length and gc content tool" type="data" format="tabular" /> | 64 <param name="length_file" type="data" format="tabular" label="Gene lengths file" help="You can calculate the gene lengths using featureCounts or the Gene length and GC content tool."/> |
42 <param name="category_file" label="Gene category file" help="You can obtain a mapping of gene id to gene ontology using the getgo tool" type="data" format="tabular" /> | 65 <conditional name="categorySource"> |
43 <param name="use_genes_without_cat" help="For example, a large number of gene may have no GO term annotated. If this option is set to FALSE, those genes will be ignored in the calculation of p-values. If this option is set to TRUE, then these genes will count towards the total number of genes outside the category being tested" | 66 <param name="catSource" type="select" format="tabular" label="Gene categories" help="You can obtain a mapping of genes to categories (for some genomes only) or you can provide your own category file."> |
44 label="Count genes without any category?" type="boolean"/> | 67 <option value="getgo" selected="true">Get categories</option> |
45 <section name="methods" title="Method options" expanded="True"> | 68 <option value="history">Use a category file from history</option> |
46 <param name="wallenius" type="boolean" checked="true" label="Use wallenius method" help="See help for details" /> | 69 </param> |
47 <param name="hypergeometric" type="boolean" checked="false" label="Use hypergeometric method" help="Does not use gene length information. See help for details" /> | 70 <when value="getgo"> |
48 <param name="repcnt" help="Draw this many random control gene sets. Set to 0 to not do sampling. Larger values take a long time" label="sampling depth" size="3" type="integer" min="0" max="10000" value="0" /> | 71 <param name="genome" type="select" label="Select a genome to use"> |
72 <option value="hg38">Human (hg38)</option> | |
73 <option value="mm10">Mouse (mm10)</option> | |
74 <option value="dm6">Fruit fly (dm6)</option> | |
75 <option value="danRer10">Zebrafish (danRer10)</option> | |
76 </param> | |
77 <param name="gene_id" type="select" label="Select Gene ID format" help="Supported Gene IDs to automatically fetch categories should either be Entrez, Ensembl, or gene symbols."> | |
78 <option value="ensGene">Ensembl Gene ID</option> | |
79 <option value="knownGene">Entrez Gene ID</option> | |
80 <option value="geneSymbol">Gene Symbol</option> | |
81 </param> | |
82 <param name="fetchcats" type="select" multiple="True" display="checkboxes" label="Select one or more categories" help="By default, goseq tests all three major Gene Ontology branches; Cellular Component, Biological Process and Molecular Function. However, it is possible to limit testing to any combination and/or to also use KEGG pathways."> | |
83 <option value="GO:CC" selected="True">GO: Cellular Component</option> | |
84 <option value="GO:BP" selected="True">GO: Biological Process</option> | |
85 <option value="GO:MF" selected="True">GO: Molecular Function</option> | |
86 <option value="KEGG">KEGG</option> | |
87 </param> | |
88 </when> | |
89 <when value="history"> | |
90 <param name="category_file" type="data" format="tabular" label="Gene category file"/> | |
91 </when> | |
92 </conditional> | |
93 | |
94 <!-- Method Options --> | |
95 <section name="methods" title="Method Options"> | |
96 <param name="wallenius" type="boolean" checked="true" label="Use Wallenius method" help="See help for details. Default: Yes" /> | |
97 <param name="hypergeometric" type="boolean" checked="false" label="Use Hypergeometric method" help="Does not use gene length information. See help for details. Default: No" /> | |
98 <param name="repcnt" type="integer" size="3" min="0" max="10000" value="0" label="Sampling number" help="Number of random samples to be calculated when sampling is used. Set to 0 to not do sampling. Larger values take a long time. Default: 0" /> | |
49 </section> | 99 </section> |
50 <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction"> | 100 |
51 <option value="BH" selected="true">Benjamini-Hochberg [FDR] (1995)</option> | 101 <!-- Output Options --> |
52 <option value="holm">Holm (1979)</option> | 102 <section name="out" title="Output Options"> |
53 <option value="hommel">Hommel (1988)</option> | 103 <param name="make_plots" type="boolean" checked="false" label="Produce diagnostic plots?" help="This will produce the length bias (PWF) plot. If both sampling and wallenius methods are selected, it will also produce a plot comparing their p-values. These plots may help you compare the different p-value estimation methods that goseq can use. Default: No" /> |
54 <option value="hochberg">Hochberg (1988)</option> | 104 <param name="rdata_out" type="boolean" checked="false" label="Output RData file?" help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No" /> |
55 <option value="bonferroni">Bonferroni</option> | 105 </section> |
56 <option value="BY">Benjamini - Yekutieli (2001)</option> | 106 |
57 </param> | 107 <!-- Advanced Options --> |
58 <param help="These plots may help you compare the different p-value estimation methods that goseq can use." label="Produce diagnostic plots?" name="make_plots" type="boolean"></param> | 108 <section name="adv" title="Advanced Options"> |
109 <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction"> | |
110 <option value="BH" selected="True">Benjamini-Hochberg [FDR] (1995)</option> | |
111 <option value="holm">Holm (1979)</option> | |
112 <option value="hommel">Hommel (1988)</option> | |
113 <option value="hochberg">Hochberg (1988)</option> | |
114 <option value="bonferroni">Bonferroni</option> | |
115 <option value="BY">Benjamini - Yekutieli (2001)</option> | |
116 </param> | |
117 <param name="use_genes_without_cat" type="boolean" checked="false" label="Count genes without any category?" help="For example, a large number of genes may have no GO term annotated. If this option is set to No, those genes will be ignored in the calculation of p-values. If this option is set to Yes, then these genes will count towards the total number of genes outside the category being tested. This was the default behaviour for version 1.15.1 and earlier. Default: No"/> | |
118 </section> | |
59 </inputs> | 119 </inputs> |
120 | |
60 <outputs> | 121 <outputs> |
61 <data name="length_bias_plot" format="pdf" label="length bias plot"> | 122 <data name="wallenius_tab" format="tabular" label="${tool.name} on ${on_string}: Ranked category list - Wallenius method"> |
62 <filter>make_plots</filter> | 123 <filter>methods['wallenius']</filter> |
124 </data> | |
125 <data name="sampling_tab" format="tabular" label="${tool.name} on ${on_string}: Ranked category list - Sampling method"> | |
126 <filter>methods['repcnt'] != 0</filter> | |
127 </data> | |
128 <data name="nobias_tab" format="tabular" label="${tool.name} on ${on_string}: Ranked category list - Hypergeometric method"> | |
63 <filter>methods['hypergeometric']</filter> | 129 <filter>methods['hypergeometric']</filter> |
64 </data> | 130 </data> |
65 <data name="sample_vs_wallenius_plot" format="pdf" label="Plot P-value from sampling against wallenius distribution"> | 131 <data name="length_bias_plot" format="pdf" label="${tool.name} on ${on_string}: Length bias plot"> |
132 <filter>out['make_plots']</filter> | |
133 </data> | |
134 <data name="sample_vs_wallenius_plot" format="pdf" label="${tool.name} on ${on_string}: Sampling vs Wallenius P-values plot"> | |
66 <filter>methods['repcnt'] != 0</filter> | 135 <filter>methods['repcnt'] != 0</filter> |
67 <filter>methods['wallenius']</filter> | 136 <filter>methods['wallenius']</filter> |
68 <filter>make_plots</filter> | 137 <filter>out['make_plots']</filter> |
69 </data> | 138 </data> |
70 <data name="nobias_tab" format="tabular" label="Ranked category list - no length bias correction"> | 139 <data name="rdata" format="rdata" from_work_dir="goseq_analysis.RData" label="${tool.name} on ${on_string}: RData file"> |
71 <filter>methods['hypergeometric']</filter> | 140 <filter>out['rdata_out']</filter> |
72 </data> | |
73 <data name="sampling_tab" format="tabular" label="Ranked category list - sampling"> | |
74 <filter>methods['repcnt'] != 0</filter> | |
75 </data> | |
76 <data name="wallenius_tab" format="tabular" label="Ranked category list - wallenius method"> | |
77 <filter>methods['wallenius']</filter> | |
78 </data> | 141 </data> |
79 </outputs> | 142 </outputs> |
143 | |
80 <tests> | 144 <tests> |
81 <test> | 145 <!-- Ensure Wallenius table is output --> |
146 <test expect_num_outputs="1"> | |
147 <param name="dge_file" value="dge_list.tab" ftype="tabular" /> | |
148 <param name="length_file" value="gene_length.tab" ftype="tabular" /> | |
149 <param name="catSource" value="history" /> | |
150 <param name="category_file" value="category.tab" ftype="tabular" /> | |
151 <param name="use_genes_without_cat" value="true" /> | |
152 <output name="wallenius_tab" file="wal.tab" compare="contains" /> | |
153 </test> | |
154 <!-- Ensure getting GO categories works --> | |
155 <test expect_num_outputs="1"> | |
82 <param name="dge_file" value="dge_list.tab" ftype="tabular"/> | 156 <param name="dge_file" value="dge_list.tab" ftype="tabular"/> |
83 <param name="length_file" value="gene_length.tab" ftype="tabular"/> | 157 <param name="length_file" value="gene_length.tab" ftype="tabular"/> |
84 <param name="category_file" value="category.tab" ftype="tabular"/> | 158 <param name="catSource" value="getgo" /> |
85 <param name="use_genes_without_cat" value="true" /> | 159 <param name="genome" value="hg38" /> |
86 <output name="wallenius_tab" file="wal.tab" compare="re_match"/> | 160 <param name="gene_id" value="ensGene" /> |
161 <param name="use_genes_without_cat" value="true" /> | |
162 <output name="wallenius_tab" ftype="tabular" file="getgo.hg38.tab" compare="contains"/> | |
163 </test> | |
164 <!-- Ensure getting GO categories for another genome (zebrafish) works --> | |
165 <test expect_num_outputs="1"> | |
166 <param name="dge_file" value="dge_list_zf.tab" ftype="tabular"/> | |
167 <param name="length_file" value="gene_length_zf.tab" ftype="tabular"/> | |
168 <param name="catSource" value="getgo" /> | |
169 <param name="genome" value="danRer10"/> | |
170 <param name="gene_id" value="ensGene" /> | |
171 <param name="use_genes_without_cat" value="true" /> | |
172 <output name="wallenius_tab" ftype="tabular" file="getgo.danRer10.tab" compare="contains"/> | |
173 </test> | |
174 <!-- Ensure length bias plot works --> | |
175 <test expect_num_outputs="2"> | |
176 <param name="dge_file" value="dge_list.tab" ftype="tabular" /> | |
177 <param name="length_file" value="gene_length.tab" ftype="tabular" /> | |
178 <param name="catSource" value="history" /> | |
179 <param name="category_file" value="category.tab" ftype="tabular" /> | |
180 <param name="make_plots" value="true" /> | |
181 <param name="use_genes_without_cat" value="true" /> | |
182 <output name="length_bias_plot" ftype="pdf" file="length_bias_plot.pdf" compare="sim_size" /> | |
183 </test> | |
184 <!-- Ensure hypergeometric works --> | |
185 <test expect_num_outputs="2"> | |
186 <param name="dge_file" value="dge_list.tab" ftype="tabular" /> | |
187 <param name="length_file" value="gene_length.tab" ftype="tabular" /> | |
188 <param name="catSource" value="history" /> | |
189 <param name="category_file" value="category.tab" ftype="tabular" /> | |
190 <param name="use_genes_without_cat" value="true" /> | |
191 <param name="hypergeometric" value="true" /> | |
192 <output name="nobias_tab" file="nobias.tab" compare="contains" /> | |
193 </test> | |
194 <!-- Ensure sampling vs wallenius works --> | |
195 <test expect_num_outputs="4"> | |
196 <param name="dge_file" value="dge_list.tab" ftype="tabular" /> | |
197 <param name="length_file" value="gene_length.tab" ftype="tabular" /> | |
198 <param name="catSource" value="history" /> | |
199 <param name="category_file" value="category.tab" ftype="tabular" /> | |
200 <param name="use_genes_without_cat" value="true" /> | |
201 <param name="make_plots" value="true" /> | |
202 <param name="repcnt" value="1000" /> | |
203 <output name="sampling_tab" file="samp.tab" compare="sim_size" /> | |
204 <output name="length_bias_plot" ftype="pdf" file="length_bias_plot.pdf" compare="sim_size" /> | |
205 <output name="sample_vs_wallenius_plot" ftype="pdf" file="sample_vs_wallenius_plot.pdf" compare="sim_size" /> | |
206 </test> | |
207 <!-- Ensure RData output works --> | |
208 <test expect_num_outputs="2"> | |
209 <param name="dge_file" value="dge_list.tab" ftype="tabular" /> | |
210 <param name="length_file" value="gene_length.tab" ftype="tabular" /> | |
211 <param name="catSource" value="history" /> | |
212 <param name="category_file" value="category.tab" ftype="tabular" /> | |
213 <param name="use_genes_without_cat" value="true" /> | |
214 <param name="rdata_out" value="true" /> | |
215 <output name="rdata" file="goseq_analysis.RData" compare="sim_size" /> | |
87 </test> | 216 </test> |
88 </tests> | 217 </tests> |
89 <help> | 218 |
90 | 219 <help><![CDATA[ |
91 **What it does** | 220 |
92 | 221 .. class:: infomark |
93 Detects Gene Ontology and/or other user defined categories which are over/under-represented in RNA-seq data. | 222 |
94 | 223 **What it does** |
95 Options map closely to the excellent manual_ | 224 |
96 | 225 `Gene Ontology`_ (GO) analysis is widely used to reduce complexity and highlight biological processes in genome-wide expression studies, but standard methods give biased results on RNA-seq data due to over-detection of differential expression for long and highly expressed transcripts. This tool provides methods for performing GO analysis of RNA-seq data, taking length bias into account. The methods and software used by goseq are equally applicable to other category based tests of RNA-seq data, such as KEGG_ pathway analysis. |
97 | 226 |
98 **Input files** | 227 Options map closely to the excellent goseq manual_. |
99 | 228 |
100 *DGE list:* | 229 ----- |
101 goseq needs a tabular file with genes in the first column, and TRUE or FALSE in the last column. | 230 |
102 TRUE means the gene should count as differentially expressed, FALSE means it is not differentially expressed. | 231 **Inputs** |
103 You can use the "Compute an expression on every row" tool to create a TRUE / FALSE column for your dataset. | 232 |
104 | 233 *Differentially expressed genes file* |
105 *Gene length file:* | 234 |
106 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes | 235 goseq needs a tabular file containing information on differentially expressed genes. This should contain all genes assayed in the RNA-seq experiment. The file should have two columns with an optional header row. The first column should contain the Gene IDs, which must be unique within the file and not repeated. The second column should contain True or False. True means the gene should count as differentially expressed, False means it is not differentially expressed. You can use the "Compute an expression on every row" tool to create a True / False column for your dataset. |
107 using a probability weight function (PWF). | 236 |
108 The format of this file is tabular, with gene_id in the first column and length in the second column. | 237 Example: |
109 The "get length and gc content" tool can produce such a file. | 238 |
110 | 239 =============== ===== |
111 *Gene category file:* | 240 ENSG00000236824 False |
112 You will also need a file describing the membership of genes in categories. The format of this file is gene_id in the first column, | 241 ENSG00000162526 False |
113 category identifier in the second column. | 242 ENSG00000090402 True |
114 | 243 ENSG00000169188 False |
115 **Method options** | 244 ENSG00000124103 False |
116 | 245 =============== ===== |
117 3 methods, "Wallenius", "Sampling" and "Hypergeometric", can be used to calculate the p-values as follows. | 246 |
118 | 247 *Gene lengths file* |
119 *"Wallenius"* approximates the true distribution of numbers of members of a category amongst DE genes by the Wallenius non-central hypergeometric distribution. | 248 |
120 This distribution assumes that within a category all genes have the same probability of being chosen. | 249 goseq needs information about the length of a gene to correct for potential length bias in differentially expressed genes using a Probability Weight Function (PWF). The PWF can be thought of as a function that gives the probability that a gene will be differentially expressed based on its length alone. The gene length file should have two columns with an optional header row. The first column should contain the Gene IDs, and the second column should contain the gene length in bp. If length data is unavailable for a gene, its entry should be set to NA. The goseq authors recommend using the gene lengths obtained from upstream summarization programs, such as **featureCounts**, if provided. Alternatively, the **Gene length and GC content** tool can produce such a file. |
121 Therefore, this approximation works best when the range in probabilities obtained by the probability weighting function is small. | 250 |
122 | 251 Example: |
123 *"Sampling"* uses random sampling to approximate the true distribution and uses it to calculate the p-values for over (and under) representation of categories. | 252 |
124 Although this is the most accurate method given a high enough value of sampling depth, its use quickly becomes computationally prohibitive. | 253 =============== ===== |
125 | 254 ENSG00000236824 13458 |
126 *"Hypergeometric"* assumes there is no bias in power to detect differential expression at all and calculates the p-values using a standard hypergeometric distribution. | 255 ENSG00000162526 2191 |
127 Useful if you wish to test the effect of selection bias on your results. | 256 ENSG00000090402 6138 |
128 | 257 ENSG00000169188 3245 |
129 CAUTION: "Hypergeometric" should NEVER be used for producing results for biological interpretation. | 258 ENSG00000124103 1137 |
130 If there is genuinely no bias in power to detect DE in your experiment, the PWF will reflect this and the other methods will produce accurate results. | 259 =============== ===== |
131 | 260 |
132 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf | 261 *Gene categories file* |
133 | 262 |
134 | 263 This tool can get GO and KEGG categories for some genomes. The three GO categories are GO:MF (Molecular Function - molecular activities of gene products), GO:CC (Cellular Component - where gene products are active), GO:BP (Biological Process - pathways and larger processes made up of the activities of multiple gene products). If your genome is not available, you will also need a file describing the membership of genes in categories. The category file should have two columns with an optional header row, with Gene ID in the first column and category identifier in the second column. As the mapping between categories and genes is usually many-to-many, this table will usually have multiple rows sharing the same Gene ID and multiple rows sharing the same category identifier. |
135 </help> | 264 |
265 Example: | |
266 | |
267 =============== =========== | |
268 ENSG00000162526 GO\:0000003 | |
269 ENSG00000198648 GO\:0000278 | |
270 ENSG00000112312 GO\:0000278 | |
271 ENSG00000174442 GO\:0000278 | |
272 ENSG00000108953 GO\:0000278 | |
273 =============== =========== | |
274 | |
275 ----- | |
276 | |
277 **Outputs** | |
278 | |
279 * This tool outputs a tabular file containing a ranked list of gene categories, similar to below. The default output is the Wallenius method table. If the Sampling and/or Hypergeometric methods are also selected, additional tables are produced. | |
280 * Optionally, this tool can also output some diagnostic plots and an RData file, see **Output Options** above. | |
281 | |
282 Example: | |
283 | |
284 =========== =============== ================ ============ ========== ======================================== ========== =================== ==================== | |
285 *category* *over_rep_pval* *under_rep_pval* *numDEInCat* *numInCat* *term* *ontology* *p.adjust.over_rep* *p.adjust.under_rep* | |
286 ----------- --------------- ---------------- ------------ ---------- ---------------------------------------- ---------- ------------------- -------------------- | |
287 GO\:0005576 0.000054 0.999975 56 142 extracellular region CC 0.394825 1 | |
288 GO\:0005840 0.000143 0.999988 9 12 ribosome CC 0.394825 1 | |
289 GO\:0044763 0.000252 0.999858 148 473 single-organism cellular process BP 0.394825 1 | |
290 GO\:0044699 0.000279 0.999844 158 513 single-organism process BP 0.394825 1 | |
291 GO\:0065010 0.000428 0.999808 43 108 extracellular membrane-bounded organelle CC 0.394825 1 | |
292 GO\:0070062 0.000428 0.999808 43 108 extracellular exosome CC 0.394825 1 | |
293 =========== =============== ================ ============ ========== ======================================== ========== =================== ==================== | |
294 | |
295 ----- | |
296 | |
297 **Method options** | |
298 | |
299 3 methods, *Wallenius*, *Sampling* and *Hypergeometric*, can be used to calculate the p-values as follows. | |
300 | |
301 *Wallenius* | |
302 | |
303 approximates the true distribution of numbers of members of a category amongst DE genes by the Wallenius non-central hypergeometric distribution. | |
304 This distribution assumes that within a category all genes have the same probability of being chosen. Therefore, this approximation works best when the range in probabilities obtained by the probability weighting function is small. This is the method used by default. | |
305 | |
306 *Sampling* | |
307 | |
308 uses random sampling to approximate the true distribution and uses it to calculate the p-values for over (and under) representation of categories. | |
309 Although this is the most accurate method given a high enough value of sampling number, its use quickly becomes computationally prohibitive. It may sometimes be desirable to use random sampling to generate the null distribution for category | |
310 membership. For example, to check consistency against results from the Wallenius approximation. This is easily accomplished by using the method option to additionally specify sampling and the number of samples to generate. | |
311 | |
312 *Hypergeometric* | |
313 | |
314 assumes there is no bias in power to detect differential expression at all and calculates the p-values using a standard hypergeometric distribution (no length bias correction is performed). Useful if you wish to test the effect of length bias on your results. | |
315 Caution: Hypergeometric should NEVER be used for producing results for biological interpretation of RNA-seq data. If length bias is truly not present in your data, goseq will produce a nearly flat PWF plot, no length bias correction will be applied to your data, and all methods will produce the same results. | |
316 | |
317 ----- | |
318 | |
319 **More Information** | |
320 | |
321 In order to account for the length bias inherent to RNA-seq data when performing a GO analysis | |
322 (or other category based tests), one cannot simply use the hypergeometric distribution as the null | |
323 distribution for category membership, which is appropriate for data without DE length bias, such | |
324 as microarray data. GO analysis of RNA-seq data requires the use of random sampling in order | |
325 to generate a suitable null distribution for GO category membership and calculate each category's | |
326 significance for over representation amongst DE genes. | |
327 | |
328 However, this random sampling is computationally expensive. In most cases, the Wallenius | |
329 distribution can be used to approximate the true null distribution, without any significant loss in | |
330 accuracy. The goseq package implements this approximation as its default option. The option | |
331 to generate the null distribution using random sampling is also included as an option, but users | |
332 should be aware that the default number of samples generated will not be enough to accurately | |
333 call enrichment when there are a large number of GO terms. | |
334 | |
335 Having established a null distribution, each category is then tested for over and under | |
336 representation amongst the set of differentially expressed genes and the null is used to calculate a | |
337 p-value for under and over representation. | |
338 | |
339 Having performed a GO analysis, you may now wish to interpret the results. If you wish to | |
340 identify categories significantly enriched/unenriched below some p-value cutoff, it is necessary to | |
341 first apply some kind of multiple hypothesis testing correction. For example, you can identify GO categories over | |
342 enriched using a 0.05 FDR (p.adjust) cutoff [Benjamini and Hochberg, 1995]. | |
343 | |
344 Unless you are a machine, GO and KEGG category identifiers are probably not very meaningful to you. | |
345 Information about each identifier can be obtained from the `Gene Ontology`_ and KEGG_ websites. | |
346 | |
347 .. _manual: https://bioconductor.org/packages/release/bioc/vignettes/goseq/inst/doc/goseq.pdf | |
348 .. _Gene Ontology: http://www.geneontology.org | |
349 .. _KEGG: http://www.genome.jp/kegg | |
350 | |
351 ]]></help> | |
136 <citations> | 352 <citations> |
137 <citation type="doi">10.1186/gb-2010-11-2-r14</citation> | 353 <citation type="doi">10.1186/gb-2010-11-2-r14</citation> |
138 </citations> | 354 </citations> |
139 </tool> | 355 </tool> |