view GO_MWU.R @ 2:5acf9dfdfa27 draft default tip

planemo upload commit 66a856bcce69986d9a6f1a39820dd9b3f4f6b0db
author cristian
date Wed, 09 Nov 2022 08:57:54 +0000
parents f7287f82602f
children
line wrap: on
line source


# GO_MWU uses continuous measure of significance (such as fold-change or
# -log(p-value) ) to identify GO categories that are significantly enriches
# with either up- or down-regulated genes. The advantage - no need to impose
# arbitrary significance cutoff.

# If the measure is binary (0 or 1) the script will perform a typical "GO
# enrichment" analysis based Fisher's exact test: it will show GO categories
# over-represented among the genes that have 1 as their measure.

# On the plot, different fonts are used to indicate significance and color
# indicates enrichment with either up (red) or down (blue) regulated genes.
# No colors are shown for binary measure analysis.

# The tree on the plot is hierarchical clustering of GO categories based on
# shared genes. Categories with no branch length between them are subsets of each other.

# The fraction next to GO category name indicates the fracton of "good" genes
# in it; "good" genes being the ones exceeding the arbitrary absValue cutoff
# (option in gomwuPlot). For Fisher's based test, specify absValue = 0.5.
# This value does not affect statistics and is used for plotting only.

# Stretch the plot manually to match tree to text

# Mikhail V. Matz, UT Austin, February 2015; matz@utexas.edu


# setup R error handling to go to stderr
options(show.error.messages = F, error = function() {
    cat(geterrmessage(), file = stderr())
    q("no", 1, F)
})

# we need that to not crash galaxy with an UTF8 error on German LC settings.
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

library("getopt")
library("tools")
options(stringAsFactors = FALSE, useFancyQuotes = FALSE)

getScriptPath <- function() {
    cmd.args <- commandArgs()
    m <- regexpr("(?<=^--file=).+", cmd.args, perl = TRUE)
    script.dir <- dirname(regmatches(cmd.args, m))
    if (length(script.dir) == 0) stop("can't determine script dir: please call the script with Rscript")
    if (length(script.dir) > 1) stop("can't determine script dir: more than one '--file' argument detected")
    return(script.dir)
}

args <- commandArgs(trailingOnly = TRUE)

# get options, using the spec as defined by the enclosed list.
# we read the options from the default: commandArgs(TRUE).
spec <- matrix(c(
    "help", "h", 0, "logical",
    "input", "i", 1, "character",
    "goAnnotations", "a", 1, "character",
    "goDatabase", "g", 1, "character",
    "goDivision", "d", 1, "character",
    "largest", "o", 1, "numeric",
    "smallest", "m", 1, "numeric",
    "absValue", "k", 1, "numeric",
    "clusterheight", "c", 1, "numeric",
    "textsize", "e", 1, "numeric",
    "pcut", "p", 1, "numeric",
    "hcut", "t", 1, "numeric",
    "l1", "1", 1, "numeric",
    "l2", "2", 1, "numeric",
    "l3", "3", 1, "numeric",
    "version", "v", 0, "character"
), byrow = TRUE, ncol = 4)
opt <- getopt(spec)

# if help was asked for print a friendly message
# and exit with a non-zero error code
if (!is.null(opt$help)) {
    cat(getopt(spec, usage = TRUE))
    q(status = 1)
}

if (!is.null(opt$version)) {
    cat("0.3.0\n")
    q(status = 1)
}

scriptdir <- getScriptPath()

### if running in Rstudio ###
# Uncomment this block, change the values and run from here.
# opt = list()
# opt$input = "heats.csv"
# opt$goAnnotations = "amil_defog_iso2go.tab"
# opt$goDatabase = "go.obo"
# opt$goDivision = "BP"

### Optionally uncomment the options below that you want to change and change the default value.
# opt$clusterheight <- 0.25
# opt$textsize <- 1.2
# opt$largest <- 0.1
# opt$smallest <- 5
# opt$absValue <- -log(0.05,10)
# opt$l1 <- 0.1
# opt$l2 <- 0.05
# opt$l3 <- 0.01
# opt$pcut <- 1e-2
# opt$hcut <- 0.9


# enforce the following required arguments
if (is.null(opt$input)) {
    cat("'input' is required\n")
    q(status = 1)
}
if (is.null(opt$goDatabase)) {
    cat("'goDatabase' is required\n")
    q(status = 1)
}
if (is.null(opt$goAnnotations)) {
    cat("'goAnnotations' is required\n")
    q(status = 1)
}
if (is.null(opt$goDivision)) {
    cat("'goDivision' is required\n")
    q(status = 1)
}
if (is.null(opt$clusterheight)) {
    opt$clusterheight <- 0.25
}
if (is.null(opt$textsize)) {
    opt$textsize <- 1.2
}
if (is.null(opt$largest)) {
    opt$largest <- 0.1
}
if (is.null(opt$smallest)) {
    opt$smallest <- 5
}
if (is.null(opt$absValue)) {
    opt$absValue <- -log(0.05, 10)
}
if (is.null(opt$l1)) {
    opt$l1 <- 0.1
}
if (is.null(opt$l2)) {
    opt$l2 <- 0.05
}
if (is.null(opt$l3)) {
    opt$l3 <- 0.01
}
if (is.null(opt$pcut)) {
    opt$pcut <- 1e-2
}
if (is.null(opt$hcut)) {
    opt$hcut <- 0.9
}

source_local <- function(fname) {
    # argv <- commandArgs(trailingOnly = FALSE)
    # base_dir <- dirname(substring(argv[grep("--file = ", argv)], 8))
    base_dir <- scriptdir
    source(paste(base_dir, fname, sep = "/"))
}

source_local("gomwu.functions.R")


nn <- strsplit(opt$input, "[/.]")
if (length(nn[[1]]) == 3) {
    dir <- nn[[1]][1]
    name <- nn[[1]][2]
    ext <- nn[[1]][3]
} else if (length(nn[[1]]) == 2) {
    dir <- "."
    name <- nn[[1]][1]
    ext <- nn[[1]][2]
}
# It might take a few minutes for MF and BP. Do not rerun it if you just want
# to replot the data with different cutoffs, go straight to gomwuPlot. If you
# change any of the numeric values below, delete the files that were generated
# in previos runs first.

gomwuStats(opt$input, opt$goDatabase, opt$goAnnotations, opt$goDivision, scriptdir,
    # replace with full path to perl executable if it is not in your system's PATH already
    perlPath = "perl",
    # a GO category will not be considered if it contains more than this fraction of the total number of genes
    largest = opt$largest,
    smallest = opt$smallest, # a GO category should contain at least this many genes to be considered.
    clusterCutHeight = opt$clusterheight, # threshold for merging similar (gene-sharing) terms. See README for details.
    # Alternative = "g" # by default the MWU test is two-tailed;
    # specify "g" or "l" of you want to test for "greater" or "less" instead.
    # Module = TRUE,Alternative="g" # un-remark this if you are analyzing a
    # SIGNED WGCNA module (values: 0 for not in module genes, kME for in-module genes).
    # In the call to gomwuPlot below, specify absValue=0.001 (count number of "good genes" that fall into the module)
    # Module = TRUE # un-remark this if you are analyzing an UNSIGNED WGCNA module
)
# do not continue if the printout shows that no GO terms pass 10% FDR.


# ----------- Plotting results

# change this to a pdf output
pdf(paste0(dir, "/", "Rplots.pdf"), width = 7, height = 7)
# png(paste0(dir,"/","Rplots.png"),res=100)

results <- gomwuPlot(opt$input, opt$goAnnotations, opt$goDivision,
    # genes with the measure value exceeding this will be counted as "good genes".
    # This setting is for signed log-pvalues. Specify absValue=0.001 if you are doing
    # Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes").
    absValue = opt$absValue,
    # 	absValue = 1, # un-remark this if you are using log2-fold changes
    # FDR threshold for plotting. Specify level1=1 to plot all GO categories containing genes exceeding the absValue.
    level1 = opt$l1,
    level2 = opt$l2, # FDR cutoff to print in regular (not italic) font.
    level3 = opt$l3, # FDR cutoff to print in large bold font.
    # decrease to fit more on one page, or increase (after rescaling the plot so the tree fits the text)
    # for better "word cloud" effect
    txtsize = opt$textsize,
    treeHeight = 0.5, # height of the hierarchical clustering tree
    # 	colors = c("dodgerblue2","firebrick1","skyblue2","lightcoral")
    # these are default colors, uncomment and change if needed
)
dev.off()
# manually rescale the plot so the tree matches the text
# if there are too many categories displayed, try make it more stringent with level1 = 0.05,level2=0.01,level3=0.001.

# text representation of results, with actual adjusted p-values
write.table(results[[1]], paste0(dir, "/", "results.tsv"), sep = "\t")


# ------- extracting representative GOs

# this module chooses GO terms that best represent *independent* groups of significant GO terms

pcut <- opt$pcut
hcut <- opt$hcut

# plotting the GO tree with the cut level (uncomment the next two lines to plot)
# plot(results[[2]],cex = 0.6)
# abline(h = hcut,col="red")

# cutting
ct <- cutree(results[[2]], h = hcut)
annots <- c()
ci <- 1
for (ci in unique(ct)) {
    message(ci)
    rn <- names(ct)[ct == ci]
    obs <- grep("obsolete", rn)
    if (length(obs) > 0) {
        rn <- rn[-obs]
    }
    if (length(rn) == 0) {
        next
    }
    rr <- results[[1]][rn, ]
    bestrr <- rr[which(rr$pval == min(rr$pval)), ]
    best <- 1
    if (nrow(bestrr) > 1) {
        nns <- sub(" .+", "", row.names(bestrr))
        fr <- c()
        for (i in 1:length(nns)) {
            fr <- c(fr, eval(parse(text = nns[i])))
        }
        best <- which(fr == max(fr))
    }
    if (bestrr$pval[best] <= pcut) {
        annots <- c(annots, sub("\\d+\\/\\d+ ", "", row.names(bestrr)[best]))
    }
}

mwus <- read.table(paste0(dir, "/", paste("MWU", opt$goDivision, name, sep = "_"), ".", ext), header = T)
best_GOs <- mwus[mwus$name %in% annots, ]
write.table(best_GOs, paste0(dir, "/", "best_go.tsv"), sep = "\t", row.names = FALSE)