Mercurial > repos > cristian > rbgoa
diff GO_MWU.R @ 0:91261b42c07e draft
"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author | cristian |
---|---|
date | Thu, 14 Apr 2022 13:28:05 +0000 |
parents | |
children | f7287f82602f |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GO_MWU.R Thu Apr 14 13:28:05 2022 +0000 @@ -0,0 +1,218 @@ + +# GO_MWU uses continuous measure of significance (such as fold-change or +# -log(p-value) ) to identify GO categories that are significantly enriches +# with either up- or down-regulated genes. The advantage - no need to impose +# arbitrary significance cutoff. + +# If the measure is binary (0 or 1) the script will perform a typical "GO +# enrichment" analysis based Fisher's exact test: it will show GO categories +# over-represented among the genes that have 1 as their measure. + +# On the plot, different fonts are used to indicate significance and color +# indicates enrichment with either up (red) or down (blue) regulated genes. +# No colors are shown for binary measure analysis. + +# The tree on the plot is hierarchical clustering of GO categories based on +# shared genes. Categories with no branch length between them are subsets of each other. + +# The fraction next to GO category name indicates the fracton of "good" genes +# in it; "good" genes being the ones exceeding the arbitrary absValue cutoff +# (option in gomwuPlot). For Fisher's based test, specify absValue = 0.5. +# This value does not affect statistics and is used for plotting only. + +# Stretch the plot manually to match tree to text + +# Mikhail V. Matz, UT Austin, February 2015; matz@utexas.edu + + +# setup R error handling to go to stderr +options(show.error.messages = F, error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, F) +}) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") + +library("getopt") +library("tools") +options(stringAsFactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs(trailingOnly = TRUE) + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "help", "h", 0, "logical", + "scriptdir", "s", 1, "character", + "input", "i", 1, "character", + "goAnnotations", "a", 1, "character", + "goDatabase", "g", 1, "character", + "goDivision", "d", 1, "character", + "largest", "o", 1, "numeric", + "smallest", "m", 1, "numeric", + "clusterheight", "c", 1, "numeric", + "pcut", "p", 1, "numeric", + "hcut", "t", 1, "numeric", + "version", "v", 0, "character" +), byrow = TRUE, ncol = 4) +opt <- getopt(spec) + +# if help was asked for print a friendly message +# and exit with a non-zero error code +if (!is.null(opt$help)) { + cat(getopt(spec, usage = TRUE)) + q(status = 1) +} + +if (!is.null(opt$version)) { + cat("0.1.0\n") + q(status = 1) +} + +# enforce the following required arguments +if (is.null(opt$scriptdir)) { + cat("'scriptdir' is required\n") + q(status = 1) +} +if (is.null(opt$input)) { + cat("'input' is required\n") + q(status = 1) +} +if (is.null(opt$goDatabase)) { + cat("'goDatabase' is required\n") + q(status = 1) +} +if (is.null(opt$goAnnotations)) { + cat("'goAnnotations' is required\n") + q(status = 1) +} +if (is.null(opt$goDivision)) { + cat("'goDivision' is required\n") + q(status = 1) +} +if (is.null(opt$clusterheight)) { + cat("'clusterheight' is required\n") + q(status = 1) +} +if (is.null(opt$largest)) { + cat("'largest' is required\n") + q(status = 1) +} +if (is.null(opt$smallest)) { + cat("'smallest' is required\n") + q(status = 1) +} +if (is.null(opt$pcut)) { + cat("'pcut' is required\n") + q(status = 1) +} +if (is.null(opt$hcut)) { + cat("'hcut' is required\n") + q(status = 1) +} + +source_local <- function(fname) { + # argv <- commandArgs(trailingOnly = FALSE) + # base_dir <- dirname(substring(argv[grep("--file = ", argv)], 8)) + base_dir <- opt$scriptdir + source(paste(base_dir, fname, sep = "/")) +} + +source_local("gomwu.functions.R") + +# It might take a few minutes for MF and BP. Do not rerun it if you just want +# to replot the data with different cutoffs, go straight to gomwuPlot. If you +# change any of the numeric values below, delete the files that were generated +# in previos runs first. + +gomwuStats(opt$input, opt$goDatabase, opt$goAnnotations, opt$goDivision, opt$scriptdir, + # replace with full path to perl executable if it is not in your system's PATH already + perlPath = "perl", + # a GO category will not be considered if it contains more than this fraction of the total number of genes + largest = opt$largest, + smallest = opt$smallest, # a GO category should contain at least this many genes to be considered. + clusterCutHeight = opt$clusterheight, # threshold for merging similar (gene-sharing) terms. See README for details. + # Alternative = "g" # by default the MWU test is two-tailed; + # specify "g" or "l" of you want to test for "greater" or "less" instead. + # Module = TRUE,Alternative="g" # un-remark this if you are analyzing a + # SIGNED WGCNA module (values: 0 for not in module genes, kME for in-module genes). + # In the call to gomwuPlot below, specify absValue=0.001 (count number of "good genes" that fall into the module) + # Module = TRUE # un-remark this if you are analyzing an UNSIGNED WGCNA module +) +# do not continue if the printout shows that no GO terms pass 10% FDR. + + +# ----------- Plotting results + +# change this to a pdf output +pdf() + +results <- gomwuPlot(opt$input, opt$goAnnotations, opt$goDivision, + # genes with the measure value exceeding this will be counted as "good genes". + # This setting is for signed log-pvalues. Specify absValue=0.001 if you are doing + # Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes"). + absValue = -log(0.05, 10), + # absValue = 1, # un-remark this if you are using log2-fold changes + # FDR threshold for plotting. Specify level1=1 to plot all GO categories containing genes exceeding the absValue. + level1 = 0.1, + level2 = 0.05, # FDR cutoff to print in regular (not italic) font. + level3 = 0.01, # FDR cutoff to print in large bold font. + # decrease to fit more on one page, or increase (after rescaling the plot so the tree fits the text) + # for better "word cloud" effect + txtsize = 1.2, + treeHeight = 0.5, # height of the hierarchical clustering tree + # colors = c("dodgerblue2","firebrick1","skyblue2","lightcoral") + # these are default colors, uncomment and change if needed +) +# manually rescale the plot so the tree matches the text +# if there are too many categories displayed, try make it more stringent with level1 = 0.05,level2=0.01,level3=0.001. + +# text representation of results, with actual adjusted p-values +write.table(results[[1]], "results.tsv", sep = "\t") + + +# ------- extracting representative GOs + +# this module chooses GO terms that best represent *independent* groups of significant GO terms + +pcut <- opt$pcut +hcut <- opt$hcut + +# plotting the GO tree with the cut level (uncomment the next two lines to plot) +# plot(results[[2]],cex = 0.6) +# abline(h = hcut,col="red") + +# cutting +ct <- cutree(results[[2]], h = hcut) +annots <- c() +ci <- 1 +for (ci in unique(ct)) { + message(ci) + rn <- names(ct)[ct == ci] + obs <- grep("obsolete", rn) + if (length(obs) > 0) { + rn <- rn[-obs] + } + if (length(rn) == 0) { + next + } + rr <- results[[1]][rn, ] + bestrr <- rr[which(rr$pval == min(rr$pval)), ] + best <- 1 + if (nrow(bestrr) > 1) { + nns <- sub(" .+", "", row.names(bestrr)) + fr <- c() + for (i in 1:length(nns)) { + fr <- c(fr, eval(parse(text = nns[i]))) + } + best <- which(fr == max(fr)) + } + if (bestrr$pval[best] <= pcut) { + annots <- c(annots, sub("\\d+\\/\\d+ ", "", row.names(bestrr)[best])) + } +} + +mwus <- read.table(paste("MWU", opt$goDivision, opt$input, sep = "_"), header = T) +best_GOs <- mwus[mwus$name %in% annots, ] +write.table(best_GOs, "best_go.tsv", sep = "\t") +dev.off()