changeset 8:8b3e3657034e draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/goseq commit 8e19f8bcaea6f607a1eaa14bb88f2d625ed63df0"
author iuc
date Fri, 06 Sep 2019 07:50:46 -0400
parents 67c29afac85f
children ef2ad746b589
files goseq.r goseq.xml
diffstat 2 files changed, 335 insertions(+), 191 deletions(-) [+]
line wrap: on
line diff
--- a/goseq.r	Sun Mar 17 10:27:17 2019 -0400
+++ b/goseq.r	Fri Sep 06 07:50:46 2019 -0400
@@ -10,47 +10,32 @@
     library("ggplot2")
 })
 
+sessionInfo()
+
 option_list <- list(
     make_option(c("-d", "--dge_file"), type="character", help="Path to file with differential gene expression result"),
-    make_option(c("-w","--wallenius_tab"), type="character", help="Path to output file with P-values estimated using wallenius distribution."),
-    make_option(c("-s","--sampling_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using sampling distribution."),
-    make_option(c("-n","--nobias_tab"), type="character", default=FALSE, help="Path to output file with P-values estimated using hypergeometric distribution and no correction for gene length bias."),
-    make_option(c("-l","--length_bias_plot"), type="character", default=FALSE, help="Path to length-bias plot."),
-    make_option(c("-sw","--sample_vs_wallenius_plot"), type="character", default=FALSE, help="Path to plot comparing sampling with wallenius p-values."),
-    make_option(c("-r", "--repcnt"), type="integer", default=100, help="Number of repeats for sampling"),
-    make_option(c("-lf", "--length_file"), type="character", default="FALSE", help = "Path to tabular file mapping gene id to length"),
-    make_option(c("-cat_file", "--category_file"), default="FALSE", type="character", help = "Path to tabular file with gene_id <-> category mapping."),
-    make_option(c("-g", "--genome"), default=NULL, type="character", help = "Genome [used for looking up correct gene length]"),
-    make_option(c("-i", "--gene_id"), default=NULL, type="character", help = "Gene ID format of genes in DGE file"),
-    make_option(c("-p", "--p_adj_method"), default="BH", type="character", help="Multiple hypothesis testing correction method to use"),
-    make_option(c("-cat", "--use_genes_without_cat"), default=FALSE, type="logical",
-                help="A large number of gene may have no GO term annotated. If this option is set to FALSE, genes without category will be ignored in the calculation of p-values(default behaviour). If TRUE these genes will count towards the total number of genes outside the tested category (default behaviour prior to version 1.15.2)."),
-    make_option(c("-plots", "--make_plots"), default=FALSE, type="logical", help="produce diagnostic plots?"),
-    make_option(c("-fc", "--fetch_cats"), default=NULL, type="character", help="Categories to get can include one or more of GO:CC, GO:BP, GO:MF, KEGG"),
-    make_option(c("-rd", "--rdata"), default=NULL, type="character", help="Path to RData output file."),
-    make_option(c("-tp", "--top_plot"), default=NULL, type="logical", help="Output PDF with top10 over-rep GO terms?")
+    make_option(c("-lf", "--length_file"), type="character", default=NULL, help="Path to tabular file mapping gene id to length"),
+    make_option(c("-g", "--genome"), type="character", default=NULL, help="Genome [used for looking up correct gene length]"),
+    make_option(c("-i", "--gene_id"), type="character", default=NULL, help="Gene ID format of genes in DGE file"),
+    make_option(c("-fc", "--fetch_cats"), type="character", default=NULL, help="Categories to get can include one or more of GO:CC, GO:BP, GO:MF, KEGG"),
+    make_option(c("-cat_file", "--category_file"), type="character", default=NULL, help="Path to tabular file with gene_id <-> category mapping"),
+    make_option(c("-w","--wallenius_tab"), type="character", default=NULL, help="Path to output file with P-values estimated using wallenius distribution"),
+    make_option(c("-n","--nobias_tab"), type="character", default=NULL, help="Path to output file with P-values estimated using hypergeometric distribution and no correction for gene length bias"),
+    make_option(c("-r", "--repcnt"), type="integer", default=0, help="Number of repeats for sampling"),
+    make_option(c("-s","--sampling_tab"), type="character", default=NULL, help="Path to output file with P-values estimated using sampling distribution"),
+    make_option(c("-p", "--p_adj_method"), type="character", default="BH", help="Multiple hypothesis testing correction method to use"),
+    make_option(c("-cat", "--use_genes_without_cat"), type="logical", default=FALSE, help="A large number of gene may have no GO term annotated. If this option is set to FALSE, genes without category will be ignored in the calculation of p-values(default behaviour). If TRUE these genes will count towards the total number of genes outside the tested category (default behaviour prior to version 1.15.2)."),
+    make_option(c("-tp", "--top_plot"), type="character", default=NULL, help="Path to output PDF with top10 over-rep GO terms"),
+    make_option(c("-plots", "--make_plots"), default=FALSE, type="logical", help="Produce diagnostic plots?"),
+    make_option(c("-l","--length_bias_plot"), type="character", default=NULL, help="Path to length-bias plot"),
+    make_option(c("-sw","--sample_vs_wallenius_plot"), type="character", default=NULL, help="Path to plot comparing sampling with wallenius p-values"),
+    make_option(c("-rd", "--rdata"), type="character", default=NULL, help="Path to RData output file"),
+    make_option(c("-g2g", "--categories_genes_out_fp"), type="character", default=NULL, help="Path to file with categories (GO/KEGG terms) and associated DE genes")
     )
 
 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 args = parse_args(parser)
 
-# Vars:
-dge_file = args$dge_file
-category_file = args$category_file
-length_file = args$length_file
-genome = args$genome
-gene_id = args$gene_id
-wallenius_tab = args$wallenius_tab
-sampling_tab = args$sampling_tab
-nobias_tab = args$nobias_tab
-length_bias_plot = args$length_bias_plot
-sample_vs_wallenius_plot = args$sample_vs_wallenius_plot
-repcnt = args$repcnt
-p_adj_method = args$p_adj_method
-use_genes_without_cat = args$use_genes_without_cat
-make_plots = args$make_plots
-rdata = args$rdata
-
 if (!is.null(args$fetch_cats)) {
   fetch_cats = unlist(strsplit(args$fetch_cats, ","))
 } else {
@@ -59,101 +44,115 @@
 
 # format DE genes into named vector suitable for goseq
 # check if header is present
-first_line = read.delim(dge_file, header = FALSE, nrow=1)
+first_line = read.delim(args$dge_file, header=FALSE, nrow=1)
 second_col = toupper(first_line[, ncol(first_line)])
 if (second_col == TRUE || second_col == FALSE) {
-    dge_table = read.delim(dge_file, header = FALSE, sep="\t")
+  dge_table = read.delim(args$dge_file, header=FALSE, sep="\t")
 } else {
-    dge_table = read.delim(dge_file, header = TRUE, sep="\t")
+  dge_table = read.delim(args$dge_file, header=TRUE, sep="\t")
 }
-genes = as.numeric(as.logical(dge_table[,ncol(dge_table)])) # Last column contains TRUE/FALSE
+genes = as.numeric(as.logical(dge_table[, ncol(dge_table)])) # Last column contains TRUE/FALSE
 names(genes) = dge_table[,1] # Assuming first column contains gene names
 
 # gene lengths, assuming last column
-if (length_file != "FALSE" ) {
-  first_line = read.delim(length_file, header = FALSE, nrow=1)
-  if (is.numeric(first_line[, ncol(first_line)])) {
-    length_table = read.delim(length_file, header=FALSE, sep="\t", check.names=FALSE)
-    } else {
-    length_table = read.delim(length_file, header=TRUE, sep="\t", check.names=FALSE)
-    }
-  row.names(length_table) = length_table[,1]
-  gene_lengths = length_table[names(genes),][,ncol(length_table)]
-  } else {
-  gene_lengths = getlength(names(genes), genome, gene_id)
-  }
+first_line = read.delim(args$length_file, header=FALSE, nrow=1)
+if (is.numeric(first_line[, ncol(first_line)])) {
+  length_table = read.delim(args$length_file, header=FALSE, sep="\t", check.names=FALSE)
+} else {
+  length_table = read.delim(args$length_file, header=TRUE, sep="\t", check.names=FALSE)
+}
+row.names(length_table) = length_table[,1]
+# get vector of gene length in same order as the genes
+gene_lengths = length_table[names(genes),][, ncol(length_table)]
 
 # Estimate PWF
-
-if (make_plots != 'false') {
-  pdf(length_bias_plot)
+if (args$make_plots) {
+  pdf(args$length_bias_plot)
 }
-pwf=nullp(genes, genome = genome, id = gene_id, bias.data = gene_lengths, plot.fit=make_plots)
-if (make_plots != 'false') {
+pwf=nullp(genes, genome=args$genome, id=args$gene_id, bias.data=gene_lengths, plot.fit=args$make_plots)
+if (args$make_plots) {
   dev.off()
 }
 
 # Fetch GO annotations if category_file hasn't been supplied:
-if (category_file == "FALSE") {
-  go_map=getgo(genes = names(genes), genome=genome, id=gene_id, fetch.cats=fetch_cats)
-  } else {
+if (is.null(args$category_file)) {
+  go_map=getgo(genes=names(genes), genome=args$genome, id=args$gene_id, fetch.cats=fetch_cats)
+} else {
   # check for header: first entry in first column must be present in genes, else it's a header
-  first_line = read.delim(category_file, header = FALSE, nrow=1)
+  first_line = read.delim(args$category_file, header=FALSE, nrow=1)
   if (first_line[,1] %in% names(genes)) {
-     go_map = read.delim(category_file, header = FALSE)
-     } else {
-     go_map = read.delim(category_file, header= TRUE)
-    }
+    go_map = read.delim(args$category_file, header=FALSE)
+  } else {
+    go_map = read.delim(args$category_file, header=TRUE)
+  }
 }
 
 results <- list()
 
-# wallenius approximation of p-values
-if (wallenius_tab != FALSE) {
-  GO.wall=goseq(pwf, genome = genome, id = gene_id, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
-  GO.wall$p.adjust.over_represented = p.adjust(GO.wall$over_represented_pvalue, method=p_adj_method)
-  GO.wall$p.adjust.under_represented = p.adjust(GO.wall$under_represented_pvalue, method=p_adj_method)
-  write.table(GO.wall, args$wallenius_tab, sep="\t", row.names = FALSE, quote = FALSE)
-  results[['Wallenius']] <- GO.wall
+runGoseq <- function(pwf, genome, gene_id, goseq_method, use_genes_without_cat, repcnt, gene2cat, p_adj_method, out_fp){  # run one goseq method, write the table to out_fp and return it
+  out=goseq(pwf, genome=genome, id=gene_id, method=goseq_method, repcnt=repcnt, use_genes_without_cat=use_genes_without_cat, gene2cat=gene2cat)  # use the arguments, not script globals; repcnt sets the number of Sampling repeats
+  out$p.adjust.over_represented = p.adjust(out$over_represented_pvalue, method=p_adj_method)
+  out$p.adjust.under_represented = p.adjust(out$under_represented_pvalue, method=p_adj_method)
+  write.table(out, out_fp, sep="\t", row.names=FALSE, quote=FALSE)
+  return(out)
+}
 
+# wallenius approximation of p-values
+if (!is.null(args$wallenius_tab)) results[['Wallenius']] <- runGoseq(
+  pwf,
+  genome=args$genome,
+  gene_id=args$gene_id,
+  goseq_method="Wallenius",
+  use_genes_without_cat=args$use_genes_without_cat,
+  repcnt=args$repcnt,
+  gene2cat=go_map,
+  p_adj_method=args$p_adj_method,
+  out_fp=args$wallenius_tab)
+
+
 # hypergeometric (no length bias correction)
-if (nobias_tab != FALSE) {
-  GO.nobias=goseq(pwf, genome = genome, id = gene_id, method="Hypergeometric", use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
-  GO.nobias$p.adjust.over_represented = p.adjust(GO.nobias$over_represented_pvalue, method=p_adj_method)
-  GO.nobias$p.adjust.under_represented = p.adjust(GO.nobias$under_represented_pvalue, method=p_adj_method)
-  write.table(GO.nobias, args$nobias_tab, sep="\t", row.names = FALSE, quote = FALSE)
-  results[['Hypergeometric']] <- GO.nobias
-}
+if (!is.null(args$nobias_tab)) results[['Hypergeometric']] <- runGoseq(
+  pwf,
+  genome=args$genome,
+  gene_id=args$gene_id,
+  goseq_method="Hypergeometric",
+  use_genes_without_cat=args$use_genes_without_cat,
+  repcnt=args$repcnt,
+  gene2cat=go_map,
+  p_adj_method=args$p_adj_method,
+  out_fp=args$nobias_tab)
 
 # Sampling distribution
-if (repcnt > 0) {
-
-  # capture the sampling progress so it doesn't fill stdout  
-  zz <- file("/dev/null", open = "wt")
-  sink(zz)
-  GO.samp=goseq(pwf, genome = genome, id = gene_id, method="Sampling", repcnt=repcnt, use_genes_without_cat = use_genes_without_cat, gene2cat=go_map)
-  sink()
+if (args$repcnt > 0){
+  results[['Sampling']] <- runGoseq(
+    pwf,
+    genome=args$genome,
+    gene_id=args$gene_id,
+    goseq_method="Sampling",
+    use_genes_without_cat=args$use_genes_without_cat,
+    repcnt=args$repcnt,
+    gene2cat=go_map,
+    p_adj_method=args$p_adj_method,
+    out_fp=args$sampling_tab)
 
-  GO.samp$p.adjust.over_represented = p.adjust(GO.samp$over_represented_pvalue, method=p_adj_method)
-  GO.samp$p.adjust.under_represented = p.adjust(GO.samp$under_represented_pvalue, method=p_adj_method)
-  write.table(GO.samp, sampling_tab, sep="\t", row.names = FALSE, quote = FALSE)
   # Compare sampling with wallenius
-  if (make_plots == TRUE) {
-  pdf(sample_vs_wallenius_plot)
-  plot(log10(GO.wall[,2]), log10(GO.samp[match(GO.samp[,1],GO.wall[,1]),2]),
-     xlab="log10(Wallenius p-values)",ylab="log10(Sampling p-values)",
-     xlim=c(-3,0))
-     abline(0,1,col=3,lty=2)
-  dev.off()
+  if (args$make_plots && !is.null(args$wallenius_tab)) {
+    pdf(args$sample_vs_wallenius_plot)
+    # match(x, table) indexes the Sampling rows so they follow the Wallenius row order
+    plot(log10(results[['Wallenius']][,2]),
+      log10(results[['Sampling']][match(results[['Wallenius']][,1], results[['Sampling']][,1]), 2]),
+      xlab="log10(Wallenius p-values)",
+      ylab="log10(Sampling p-values)", xlim=c(-3,0))
+    abline(0,1,col=3,lty=2)
+    dev.off()
   }
-  results[['Sampling']] <- GO.samp
 }
 
+# Plot the top 10
 if (!is.null(args$top_plot)) {
   cats_title <- gsub("GO:","", args$fetch_cats)
   # modified from https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2018/RNASeq2018/html/06_Gene_set_testing.nb.html
-  pdf("top10.pdf")
+  pdf(args$top_plot)
   for (m in names(results)) {
     p <- results[[m]] %>%
       top_n(10, wt=-over_represented_pvalue)  %>%
@@ -165,16 +164,35 @@
       geom_point() +
       expand_limits(x=0) +
       labs(x="% DE in category", y="Category", colour="Adj P value", size="Count", title=paste("Top over-represented categories in", cats_title), subtitle=paste(m, " method")) +
-      theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5))
+      theme(plot.title=element_text(hjust = 0.5), plot.subtitle=element_text(hjust = 0.5))
     print(p)
   }
   dev.off()
 }
 
+# Extract the genes to the categories (GO/KEGG terms)
+if (!is.null(args$categories_genes_out_fp)) {
+  # go_map is a named list (gene -> categories) from getgo, or a data frame read from
+  # the category file (first column gene id, second column category)
+  if (is.data.frame(go_map)) {
+    cat2gene = split(as.character(go_map[, 1]), as.character(go_map[, 2]))
+  } else {
+    cat2gene = split(rep(names(go_map), lengths(go_map)), unlist(go_map, use.names = FALSE))
+  }
+  # extract categories (GO/KEGG terms) for all results
+  categories = unique(as.character(unlist(lapply(results, function(res) res$category), use.names=FALSE)))
+  # DE genes of one category: keep only genes present in pwf and flagged as DE
+  get_de_genes = function(category) {
+    in_cat = intersect(cat2gene[[category]], rownames(pwf))
+    paste(in_cat[pwf[in_cat, 'DEgenes'] > 0], collapse=',')
+  }
+  de_genes = vapply(categories, get_de_genes, character(1), USE.NAMES=FALSE)
+  categories_genes = data.frame(Categories=categories, DEgenes=de_genes, stringsAsFactors=FALSE)
+  # output
+  write.table(categories_genes, args$categories_genes_out_fp, sep = "\t", row.names=FALSE, quote=FALSE)
+}
+
 # Output RData file
 if (!is.null(args$rdata)) {
-  save.image(file = "goseq_analysis.RData")
+  save.image(file=args$rdata)
 }
-
-
-sessionInfo()
--- a/goseq.xml	Sun Mar 17 10:27:17 2019 -0400
+++ b/goseq.xml	Fri Sep 06 07:50:46 2019 -0400
@@ -1,14 +1,18 @@
-<tool id="goseq" name="goseq" version="1.34.0+galaxy1">
+<tool id="goseq" name="goseq" version="@VERSION@+@GALAXY_VERSION@">
     <description>tests for overrepresented gene categories</description>
+    <macros>
+        <token name="@VERSION@">1.36.0</token>
+        <token name="@GALAXY_VERSION@">galaxy0</token>
+    </macros>
     <requirements>
-        <requirement type="package" version="1.34.0">bioconductor-goseq</requirement>
-        <requirement type="package" version="3.7.0">bioconductor-org.hs.eg.db</requirement>
-        <requirement type="package" version="3.7.0">bioconductor-org.dm.eg.db</requirement>
-        <requirement type="package" version="3.7.0">bioconductor-org.dr.eg.db</requirement>
-        <requirement type="package" version="3.7.0">bioconductor-org.mm.eg.db</requirement>
-        <requirement type="package" version="0.7.8">r-dplyr</requirement>
-        <requirement type="package" version="3.1.0">r-ggplot2</requirement>
-        <requirement type="package" version="1.6.0">r-optparse</requirement>
+        <requirement type="package" version="@VERSION@">bioconductor-goseq</requirement>
+        <requirement type="package" version="3.8.2">bioconductor-org.hs.eg.db</requirement>
+        <requirement type="package" version="3.8.2">bioconductor-org.dm.eg.db</requirement>
+        <requirement type="package" version="3.8.2">bioconductor-org.dr.eg.db</requirement>
+        <requirement type="package" version="3.8.2">bioconductor-org.mm.eg.db</requirement>
+        <requirement type="package" version="0.8.3">r-dplyr</requirement>
+        <requirement type="package" version="3.2.1">r-ggplot2</requirement>
+        <requirement type="package" version="1.6.2">r-optparse</requirement>
     </requirements>
     <stdio>
         <regex match="Execution halted"
@@ -30,42 +34,54 @@
     <command><![CDATA[
 Rscript '$__tool_directory__/goseq.r'
 
---dge_file '$dge_file'
---length_file '$length_file'
+    --dge_file '$dge_file'
+    --length_file '$length_file'
 
-#if $categorySource.catSource == 'getgo':
+#if $categorySource.catSource == 'getgo'
     --genome $categorySource.genome
     --gene_id $categorySource.gene_id
     --fetch_cats '$categorySource.fetchcats'
-#elif $categorySource.catSource == 'history':
+#elif $categorySource.catSource == 'history'
     --category_file '$categorySource.category_file'
 #end if
 
-#if $methods['wallenius']:
+#if $methods.wallenius
     --wallenius_tab '$wallenius_tab'
 #end if
-#if $methods['hypergeometric']:
+#if $methods.hypergeometric
     --nobias_tab '$nobias_tab'
 #end if
---repcnt '$methods.repcnt'
---sampling_tab '$sampling_tab'
 
---make_plots '$out.make_plots'
---length_bias_plot '$length_bias_plot'
---sample_vs_wallenius_plot '$sample_vs_wallenius_plot'
+    --repcnt $methods.repcnt
+#if $methods.repcnt != 0
+    --sampling_tab '$sampling_tab'
+#end if
 
---rdata '$out.rdata_out'
---p_adj_method '$adv.p_adj_method'
---use_genes_without_cat '$adv.use_genes_without_cat'
+    --p_adj_method '$adv.p_adj_method'
+    --use_genes_without_cat '$adv.use_genes_without_cat'
 
-#if $out.topgo_plot:
-    --top_plot '$out.topgo_plot'
+#if $out.topgo_plot
+    --top_plot '$top_plot'
 #end if
 
-    ]]></command>
+#if str($out.make_plots) == 'TRUE'
+    --make_plots '$out.make_plots'
+    --length_bias_plot '$length_bias_plot'
+    #if $methods.repcnt != 0 and $methods.wallenius
+    --sample_vs_wallenius_plot '$sample_vs_wallenius_plot'
+    #end if
+#end if
 
-    <!-- Input Files-->
+#if $out.cat_genes
+    --categories_genes_out_fp '$cat_genes_tab'
+#end if
+
+#if $out.rdata_out
+    --rdata '$rdata'
+#end if
+    ]]></command>
     <inputs>
+        <!-- Input Files-->
         <param name="dge_file" type="data" format="tabular" label="Differentially expressed genes file" help="A tabular file with Gene IDs in the first column, and True or False in the second column. True means a gene is differentially expressed. See Help section for details."/>
         <param name="length_file" type="data" format="tabular" label="Gene lengths file" help="You can calculate the gene lengths using featureCounts or the Gene length and GC content tool."/>
         <conditional name="categorySource">
@@ -96,21 +112,12 @@
                 <param name="category_file" type="data" format="tabular" label="Gene category file"/>
             </when>
         </conditional>
-
         <!-- Method Options -->
         <section name="methods" title="Method Options">
-            <param name="wallenius" type="boolean" checked="true" label="Use Wallenius method" help="See help for details. Default: Yes" />
-            <param name="hypergeometric" type="boolean" checked="false" label="Use Hypergeometric method" help="Does not use gene length information. See help for details. Default: No" />
-            <param name="repcnt" type="integer" size="3" min="0" max="10000" value="0" label="Sampling number" help="Number of random samples to be calculated when sampling is used. Set to 0 to not do sampling. Larger values take a long time. Default: 0" />
+            <param name="wallenius" type="boolean" checked="true" label="Use Wallenius method" help="See help for details" />
+            <param name="hypergeometric" type="boolean" checked="false" label="Use Hypergeometric method" help="Does not use gene length information. See help for details" />
+            <param name="repcnt" type="integer" size="3" min="0" max="10000" value="0" label="Sampling number" help="Number of random samples to be calculated when sampling is used. Set to 0 to not do sampling. Larger values take a long time" />
         </section>
-
-        <!-- Output Options -->
-        <section name="out" title="Output Options">
-            <param name="topgo_plot" type="boolean" checked="false" label="Output Top GO terms plot?" help="Output a PDF plot of the Top 10 over-represented GO terms. Default: No" />
-            <param name="make_plots" type="boolean" checked="false" label="Produce diagnostic plots?" help="This will produce the length bias (PWF) plot. If both sampling and wallenius methods are selected, it will also produce a plot comparing their p-values. These plots may help you compare the different p-value estimation methods that goseq can use. Default: No" />
-            <param name="rdata_out" type="boolean" checked="false" label="Output RData file?" help="Output all the data used by R to construct the tables and plots, can be loaded into R. Default: No" />
-        </section>
-
         <!-- Advanced Options -->
         <section name="adv" title="Advanced Options">
             <param name="p_adj_method" type="select" label="Select a method for multiple hypothesis testing correction">
@@ -121,10 +128,16 @@
                 <option value="bonferroni">Bonferroni</option>
                 <option value="BY">Benjamini - Yekutieli (2001)</option>
             </param>
-            <param name="use_genes_without_cat" type="boolean" checked="false" label="Count genes without any category?" help="For example, a large number of genes may have no GO term annotated. If this option is set to No, those genes will be ignored in the calculation of p-values. If this option is set to Yes, then these genes will count towards the total number of genes outside the category being tested. This was the default behaviour for version 1.15.1 and earlier. Default: No"/>
+            <param name="use_genes_without_cat" type="boolean" checked="false" label="Count genes without any category?" help="For example, a large number of genes may have no GO term annotated. If this option is set to No, those genes will be ignored in the calculation of p-values. If this option is set to Yes, then these genes will count towards the total number of genes outside the category being tested. This was the default behaviour for version 1.15.1 and earlier"/>
+        </section>
+        <!-- Output Options -->
+        <section name="out" title="Output Options">
+            <param name="topgo_plot" type="boolean" checked="false" label="Output Top GO terms plot?" help="Output a PDF plot of the Top 10 over-represented GO terms" />
+            <param name="make_plots" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Produce diagnostic plots?" help="This will produce the length bias (PWF) plot. If both sampling and wallenius methods are selected, it will also produce a plot comparing their p-values. These plots may help you compare the different p-value estimation methods that goseq can use" />
+            <param name="cat_genes" type="boolean" checked="false" label="Extract the DE genes for the categories (GO/KEGG terms)?" help="" />
+            <param name="rdata_out" type="boolean" checked="false" label="Output RData file?" help="Output all the data used by R to construct the tables and plots, can be loaded into R" />
         </section>
     </inputs>
-
     <outputs>
         <data name="wallenius_tab" format="tabular" label="${tool.name} on ${on_string}: Ranked category list - Wallenius method">
             <filter>methods['wallenius']</filter>
@@ -143,33 +156,42 @@
             <filter>methods['wallenius']</filter>
             <filter>out['make_plots']</filter>
         </data>
-        <data name="rdata" format="rdata" from_work_dir="goseq_analysis.RData" label="${tool.name} on ${on_string}: RData file">
-            <filter>out['rdata_out']</filter>
-        </data>
-        <data name="top_plot" format="pdf" from_work_dir="top10.pdf" label="${tool.name} on ${on_string}: Top over-represented GO terms plot">
+        <data name="top_plot" format="pdf" label="${tool.name} on ${on_string}: Top over-represented GO terms plot">
             <filter>methods['wallenius']</filter>
             <filter>out['topgo_plot']</filter>
         </data>
+        <data name="cat_genes_tab" format="tabular" label="${tool.name} on ${on_string}: DE genes for categories (GO/KEGG terms)">
+            <filter>out['cat_genes']</filter>
+        </data>
+        <data name="rdata" format="rdata" label="${tool.name} on ${on_string}: RData file">
+            <filter>out['rdata_out']</filter>
+        </data>
     </outputs>
-
     <tests>
-        <!-- Ensure top plot is output -->
+        <!-- Ensure top plot is output and check Wallenius -->
         <test expect_num_outputs="2">
             <param name="dge_file" value="dge_list.tab" ftype="tabular" />
             <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
-            <param name="use_genes_without_cat" value="true" />
-            <param name="topgo_plot" value="true" />
+            <conditional name="categorySource">
+                <param name="catSource" value="history" />
+                <param name="category_file" value="category.tab" ftype="tabular" />
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="true"/>
+                <param name="make_plots" value="false"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <output name="top_plot" ftype="pdf" file="topgo.pdf" compare="sim_size"/>
-        </test>
-        <!-- Ensure Wallenius table is output -->
-        <test expect_num_outputs="1">
-            <param name="dge_file" value="dge_list.tab" ftype="tabular" />
-            <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
-            <param name="use_genes_without_cat" value="true" />
             <output name="wallenius_tab">
                 <assert_contents>
                     <has_text_matching expression="category.*over_represented_pvalue.*under_represented_pvalue.*numDEInCat.*numInCat.*term.*ontology.*p.adjust.over_represented.*p.adjust.under_represented" />
@@ -177,18 +199,41 @@
                 </assert_contents>
             </output>
         </test>
-        <!-- Ensure getting GO categories works -->
-        <test expect_num_outputs="1">
+        <!-- Ensure getting GO categories works & also DE genes for GO terms-->
+        <test expect_num_outputs="2">
             <param name="dge_file" value="dge_list.tab" ftype="tabular"/>
             <param name="length_file" value="gene_length.tab" ftype="tabular"/>
-            <param name="catSource" value="getgo" />
-            <param name="genome" value="hg38" />
-            <param name="gene_id" value="ensGene" />
-            <param name="use_genes_without_cat" value="true" />
+            <conditional name="categorySource">
+                <param name="catSource" value="getgo" />
+                <param name="genome" value="hg38" />
+                <param name="gene_id" value="ensGene" />
+                <param name="fetchcats" value="GO:CC,GO:BP,GO:MF"/>
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="false"/>
+                <param name="cat_genes" value="true"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <output name="wallenius_tab">
                 <assert_contents>
                     <has_text_matching expression="category.*over_represented_pvalue.*under_represented_pvalue.*numDEInCat.*numInCat.*term.*ontology.*p.adjust.over_represented.*p.adjust.under_represented" />
-                    <has_text_matching expression="GO:0005576.*8.8" />
+                    <has_text_matching expression="GO:0005576.*9.0" />
+                </assert_contents>
+            </output>
+            <output name="cat_genes_tab">
+                <assert_contents>
+                    <has_text_matching expression="Categories.*DEgenes" />
+                    <has_text_matching expression="GO:0005615.*ENSG00000090402,ENSG00000108953,ENSG00000070961" />
                 </assert_contents>
             </output>
         </test>
@@ -196,14 +241,31 @@
         <test expect_num_outputs="1">
             <param name="dge_file" value="dge_list_zf.tab" ftype="tabular"/>
             <param name="length_file" value="gene_length_zf.tab" ftype="tabular"/>
-            <param name="catSource" value="getgo" />
-            <param name="genome" value="danRer10"/>
-            <param name="gene_id" value="ensGene" />
-            <param name="use_genes_without_cat" value="true" />
+            <conditional name="categorySource">
+                <param name="catSource" value="getgo" />
+                <param name="genome" value="danRer10"/>
+                <param name="gene_id" value="ensGene" />
+                <param name="fetchcats" value="GO:CC,GO:BP,GO:MF"/>
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="false"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <output name="wallenius_tab">
                 <assert_contents>
                     <has_text_matching expression="category.*over_represented_pvalue.*under_represented_pvalue.*numDEInCat.*numInCat.*term.*ontology.*p.adjust.over_represented.*p.adjust.under_represented" />
-                    <has_text_matching expression="GO:0031324.*0.50" />
+                    <has_text_matching expression="GO:0016569.*0.8" />
                 </assert_contents>
             </output>
         </test>
@@ -211,31 +273,76 @@
         <test expect_num_outputs="2">
             <param name="dge_file" value="dge_list.tab" ftype="tabular" />
             <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
+            <conditional name="categorySource">
+                <param name="catSource" value="history" />
+                <param name="category_file" value="category.tab" ftype="tabular" />
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="true"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <param name="make_plots" value="true" />
-            <param name="use_genes_without_cat" value="true" />
             <output name="length_bias_plot" ftype="pdf" file="length_bias_plot.pdf" compare="sim_size" />
         </test>
         <!-- Ensure hypergeometric works -->
         <test expect_num_outputs="2">
             <param name="dge_file" value="dge_list.tab" ftype="tabular" />
             <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
-            <param name="use_genes_without_cat" value="true" />
-            <param name="hypergeometric" value="true" />
+            <conditional name="categorySource">
+                <param name="catSource" value="history" />
+                <param name="category_file" value="category.tab" ftype="tabular" />
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="true"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="false"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <output name="nobias_tab" file="nobias.tab" compare="contains" />
         </test>
         <!-- Ensure sampling vs wallenius works -->
         <test expect_num_outputs="4">
             <param name="dge_file" value="dge_list.tab" ftype="tabular" />
             <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
-            <param name="use_genes_without_cat" value="true" />
-            <param name="make_plots" value="true" />
-            <param name="repcnt" value="1000" />
+            <conditional name="categorySource">
+                <param name="catSource" value="history" />
+                <param name="category_file" value="category.tab" ftype="tabular" />
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="1000"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="true"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="false"/>
+            </section>
             <output name="sampling_tab" file="samp.tab" compare="sim_size" />
             <output name="length_bias_plot" ftype="pdf" file="length_bias_plot.pdf" compare="sim_size" />
             <output name="sample_vs_wallenius_plot" ftype="pdf" file="sample_vs_wallenius_plot.pdf" compare="sim_size" />
@@ -244,14 +351,28 @@
         <test expect_num_outputs="2">
             <param name="dge_file" value="dge_list.tab" ftype="tabular" />
             <param name="length_file" value="gene_length.tab" ftype="tabular" />
-            <param name="catSource" value="history" />
-            <param name="category_file" value="category.tab" ftype="tabular" />
-            <param name="use_genes_without_cat" value="true" />
-            <param name="rdata_out" value="true" />
+            <conditional name="categorySource">
+                <param name="catSource" value="history" />
+                <param name="category_file" value="category.tab" ftype="tabular" />
+            </conditional>
+            <section name="methods">
+                <param name="wallenius" value="true"/>
+                <param name="hypergeometric" value="false"/>
+                <param name="repcnt" value="0"/>
+            </section>
+            <section name="adv">
+                <param name="p_adj_method" value="BH"/>
+                <param name="use_genes_without_cat" value="true" />
+            </section>
+            <section name="out">
+                <param name="topgo_plot" value="false"/>
+                <param name="make_plots" value="false"/>
+                <param name="cat_genes" value="false"/>
+                <param name="rdata_out" value="true"/>
+            </section>
             <output name="rdata" file="goseq_analysis.RData" compare="sim_size" />
         </test>
     </tests>
-
     <help><![CDATA[
 
 .. class:: infomark
@@ -312,8 +433,7 @@
 
 **Outputs**
 
-* This tool outputs a tabular file containing a ranked list of gene categories, similar to below. The default output is the Wallenius method table. If the Sampling and/or Hypergeometric methods are also selected, additional tables are produced.
-* Optionally, this tool can also output a plot of the top 10 over-represented GO categories, some diagnostic plots and an RData file, see **Output Options** above.
+This tool outputs a tabular file containing a ranked list of gene categories, similar to the example below. The default output is the Wallenius method table. If the Sampling and/or Hypergeometric methods are also selected, additional tables are produced.
 
 Example:
 
@@ -328,6 +448,12 @@
 GO\:0070062  0.000428        0.999808         43           108       extracellular exosome                    CC         0.394825             1
 =========== =============== ================ ============ ========== ======================================== ========== =================== ====================
 
+Optionally, this tool can also output:
+  * a plot of the top 10 over-represented GO categories
+  * some diagnostic plots
+  * a tabular file with the differentially expressed genes in each category (GO/KEGG terms)
+  * an RData file
+
 -----
 
 **Method options**