comparison GO_MWU.R @ 0:91261b42c07e draft

"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author cristian
date Thu, 14 Apr 2022 13:28:05 +0000
parents
children f7287f82602f
comparison
equal deleted inserted replaced
-1:000000000000 0:91261b42c07e
1
2 # GO_MWU uses continuous measure of significance (such as fold-change or
3 # -log(p-value) ) to identify GO categories that are significantly enriches
4 # with either up- or down-regulated genes. The advantage - no need to impose
5 # arbitrary significance cutoff.
6
7 # If the measure is binary (0 or 1) the script will perform a typical "GO
8 # enrichment" analysis based Fisher's exact test: it will show GO categories
9 # over-represented among the genes that have 1 as their measure.
10
11 # On the plot, different fonts are used to indicate significance and color
12 # indicates enrichment with either up (red) or down (blue) regulated genes.
13 # No colors are shown for binary measure analysis.
14
15 # The tree on the plot is hierarchical clustering of GO categories based on
16 # shared genes. Categories with no branch length between them are subsets of each other.
17
18 # The fraction next to GO category name indicates the fracton of "good" genes
19 # in it; "good" genes being the ones exceeding the arbitrary absValue cutoff
20 # (option in gomwuPlot). For Fisher's based test, specify absValue = 0.5.
21 # This value does not affect statistics and is used for plotting only.
22
23 # Stretch the plot manually to match tree to text
24
25 # Mikhail V. Matz, UT Austin, February 2015; matz@utexas.edu
26
27
28 # setup R error handling to go to stderr
29 options(show.error.messages = F, error = function() {
30 cat(geterrmessage(), file = stderr())
31 q("no", 1, F)
32 })
33
34 # we need that to not crash galaxy with an UTF8 error on German LC settings.
35 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
36
37 library("getopt")
38 library("tools")
39 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
40 args <- commandArgs(trailingOnly = TRUE)
41
42 # get options, using the spec as defined by the enclosed list.
43 # we read the options from the default: commandArgs(TRUE).
44 spec <- matrix(c(
45 "help", "h", 0, "logical",
46 "scriptdir", "s", 1, "character",
47 "input", "i", 1, "character",
48 "goAnnotations", "a", 1, "character",
49 "goDatabase", "g", 1, "character",
50 "goDivision", "d", 1, "character",
51 "largest", "o", 1, "numeric",
52 "smallest", "m", 1, "numeric",
53 "clusterheight", "c", 1, "numeric",
54 "pcut", "p", 1, "numeric",
55 "hcut", "t", 1, "numeric",
56 "version", "v", 0, "character"
57 ), byrow = TRUE, ncol = 4)
58 opt <- getopt(spec)
59
60 # if help was asked for print a friendly message
61 # and exit with a non-zero error code
62 if (!is.null(opt$help)) {
63 cat(getopt(spec, usage = TRUE))
64 q(status = 1)
65 }
66
67 if (!is.null(opt$version)) {
68 cat("0.1.0\n")
69 q(status = 1)
70 }
71
72 # enforce the following required arguments
73 if (is.null(opt$scriptdir)) {
74 cat("'scriptdir' is required\n")
75 q(status = 1)
76 }
77 if (is.null(opt$input)) {
78 cat("'input' is required\n")
79 q(status = 1)
80 }
81 if (is.null(opt$goDatabase)) {
82 cat("'goDatabase' is required\n")
83 q(status = 1)
84 }
85 if (is.null(opt$goAnnotations)) {
86 cat("'goAnnotations' is required\n")
87 q(status = 1)
88 }
89 if (is.null(opt$goDivision)) {
90 cat("'goDivision' is required\n")
91 q(status = 1)
92 }
93 if (is.null(opt$clusterheight)) {
94 cat("'clusterheight' is required\n")
95 q(status = 1)
96 }
97 if (is.null(opt$largest)) {
98 cat("'largest' is required\n")
99 q(status = 1)
100 }
101 if (is.null(opt$smallest)) {
102 cat("'smallest' is required\n")
103 q(status = 1)
104 }
105 if (is.null(opt$pcut)) {
106 cat("'pcut' is required\n")
107 q(status = 1)
108 }
109 if (is.null(opt$hcut)) {
110 cat("'hcut' is required\n")
111 q(status = 1)
112 }
113
114 source_local <- function(fname) {
115 # argv <- commandArgs(trailingOnly = FALSE)
116 # base_dir <- dirname(substring(argv[grep("--file = ", argv)], 8))
117 base_dir <- opt$scriptdir
118 source(paste(base_dir, fname, sep = "/"))
119 }
120
121 source_local("gomwu.functions.R")
122
123 # It might take a few minutes for MF and BP. Do not rerun it if you just want
124 # to replot the data with different cutoffs, go straight to gomwuPlot. If you
125 # change any of the numeric values below, delete the files that were generated
126 # in previos runs first.
127
128 gomwuStats(opt$input, opt$goDatabase, opt$goAnnotations, opt$goDivision, opt$scriptdir,
129 # replace with full path to perl executable if it is not in your system's PATH already
130 perlPath = "perl",
131 # a GO category will not be considered if it contains more than this fraction of the total number of genes
132 largest = opt$largest,
133 smallest = opt$smallest, # a GO category should contain at least this many genes to be considered.
134 clusterCutHeight = opt$clusterheight, # threshold for merging similar (gene-sharing) terms. See README for details.
135 # Alternative = "g" # by default the MWU test is two-tailed;
136 # specify "g" or "l" of you want to test for "greater" or "less" instead.
137 # Module = TRUE,Alternative="g" # un-remark this if you are analyzing a
138 # SIGNED WGCNA module (values: 0 for not in module genes, kME for in-module genes).
139 # In the call to gomwuPlot below, specify absValue=0.001 (count number of "good genes" that fall into the module)
140 # Module = TRUE # un-remark this if you are analyzing an UNSIGNED WGCNA module
141 )
142 # do not continue if the printout shows that no GO terms pass 10% FDR.
143
144
145 # ----------- Plotting results
146
147 # change this to a pdf output
148 pdf()
149
150 results <- gomwuPlot(opt$input, opt$goAnnotations, opt$goDivision,
151 # genes with the measure value exceeding this will be counted as "good genes".
152 # This setting is for signed log-pvalues. Specify absValue=0.001 if you are doing
153 # Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes").
154 absValue = -log(0.05, 10),
155 # absValue = 1, # un-remark this if you are using log2-fold changes
156 # FDR threshold for plotting. Specify level1=1 to plot all GO categories containing genes exceeding the absValue.
157 level1 = 0.1,
158 level2 = 0.05, # FDR cutoff to print in regular (not italic) font.
159 level3 = 0.01, # FDR cutoff to print in large bold font.
160 # decrease to fit more on one page, or increase (after rescaling the plot so the tree fits the text)
161 # for better "word cloud" effect
162 txtsize = 1.2,
163 treeHeight = 0.5, # height of the hierarchical clustering tree
164 # colors = c("dodgerblue2","firebrick1","skyblue2","lightcoral")
165 # these are default colors, uncomment and change if needed
166 )
167 # manually rescale the plot so the tree matches the text
168 # if there are too many categories displayed, try make it more stringent with level1 = 0.05,level2=0.01,level3=0.001.
169
170 # text representation of results, with actual adjusted p-values
171 write.table(results[[1]], "results.tsv", sep = "\t")
172
173
174 # ------- extracting representative GOs
175
176 # this module chooses GO terms that best represent *independent* groups of significant GO terms
177
178 pcut <- opt$pcut
179 hcut <- opt$hcut
180
181 # plotting the GO tree with the cut level (uncomment the next two lines to plot)
182 # plot(results[[2]],cex = 0.6)
183 # abline(h = hcut,col="red")
184
185 # cutting
186 ct <- cutree(results[[2]], h = hcut)
187 annots <- c()
188 ci <- 1
189 for (ci in unique(ct)) {
190 message(ci)
191 rn <- names(ct)[ct == ci]
192 obs <- grep("obsolete", rn)
193 if (length(obs) > 0) {
194 rn <- rn[-obs]
195 }
196 if (length(rn) == 0) {
197 next
198 }
199 rr <- results[[1]][rn, ]
200 bestrr <- rr[which(rr$pval == min(rr$pval)), ]
201 best <- 1
202 if (nrow(bestrr) > 1) {
203 nns <- sub(" .+", "", row.names(bestrr))
204 fr <- c()
205 for (i in 1:length(nns)) {
206 fr <- c(fr, eval(parse(text = nns[i])))
207 }
208 best <- which(fr == max(fr))
209 }
210 if (bestrr$pval[best] <= pcut) {
211 annots <- c(annots, sub("\\d+\\/\\d+ ", "", row.names(bestrr)[best]))
212 }
213 }
214
215 mwus <- read.table(paste("MWU", opt$goDivision, opt$input, sep = "_"), header = T)
216 best_GOs <- mwus[mwus$name %in% annots, ]
217 write.table(best_GOs, "best_go.tsv", sep = "\t")
218 dev.off()