Mercurial > repos > cristian > rbgoa
comparison GO_MWU.R @ 0:91261b42c07e draft
"planemo upload commit 25eebba0c98dd7a5a703412be90e97f13f66b5bc"
author | cristian |
---|---|
date | Thu, 14 Apr 2022 13:28:05 +0000 |
parents | |
children | f7287f82602f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:91261b42c07e |
---|---|
1 | |
2 # GO_MWU uses continuous measure of significance (such as fold-change or | |
3 # -log(p-value) ) to identify GO categories that are significantly enriches | |
4 # with either up- or down-regulated genes. The advantage - no need to impose | |
5 # arbitrary significance cutoff. | |
6 | |
7 # If the measure is binary (0 or 1) the script will perform a typical "GO | |
8 # enrichment" analysis based Fisher's exact test: it will show GO categories | |
9 # over-represented among the genes that have 1 as their measure. | |
10 | |
11 # On the plot, different fonts are used to indicate significance and color | |
12 # indicates enrichment with either up (red) or down (blue) regulated genes. | |
13 # No colors are shown for binary measure analysis. | |
14 | |
15 # The tree on the plot is hierarchical clustering of GO categories based on | |
16 # shared genes. Categories with no branch length between them are subsets of each other. | |
17 | |
18 # The fraction next to GO category name indicates the fracton of "good" genes | |
19 # in it; "good" genes being the ones exceeding the arbitrary absValue cutoff | |
20 # (option in gomwuPlot). For Fisher's based test, specify absValue = 0.5. | |
21 # This value does not affect statistics and is used for plotting only. | |
22 | |
23 # Stretch the plot manually to match tree to text | |
24 | |
25 # Mikhail V. Matz, UT Austin, February 2015; matz@utexas.edu | |
26 | |
27 | |
28 # setup R error handling to go to stderr | |
29 options(show.error.messages = F, error = function() { | |
30 cat(geterrmessage(), file = stderr()) | |
31 q("no", 1, F) | |
32 }) | |
33 | |
34 # we need that to not crash galaxy with an UTF8 error on German LC settings. | |
35 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
36 | |
37 library("getopt") | |
38 library("tools") | |
39 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | |
40 args <- commandArgs(trailingOnly = TRUE) | |
41 | |
42 # get options, using the spec as defined by the enclosed list. | |
43 # we read the options from the default: commandArgs(TRUE). | |
44 spec <- matrix(c( | |
45 "help", "h", 0, "logical", | |
46 "scriptdir", "s", 1, "character", | |
47 "input", "i", 1, "character", | |
48 "goAnnotations", "a", 1, "character", | |
49 "goDatabase", "g", 1, "character", | |
50 "goDivision", "d", 1, "character", | |
51 "largest", "o", 1, "numeric", | |
52 "smallest", "m", 1, "numeric", | |
53 "clusterheight", "c", 1, "numeric", | |
54 "pcut", "p", 1, "numeric", | |
55 "hcut", "t", 1, "numeric", | |
56 "version", "v", 0, "character" | |
57 ), byrow = TRUE, ncol = 4) | |
58 opt <- getopt(spec) | |
59 | |
60 # if help was asked for print a friendly message | |
61 # and exit with a non-zero error code | |
62 if (!is.null(opt$help)) { | |
63 cat(getopt(spec, usage = TRUE)) | |
64 q(status = 1) | |
65 } | |
66 | |
67 if (!is.null(opt$version)) { | |
68 cat("0.1.0\n") | |
69 q(status = 1) | |
70 } | |
71 | |
72 # enforce the following required arguments | |
73 if (is.null(opt$scriptdir)) { | |
74 cat("'scriptdir' is required\n") | |
75 q(status = 1) | |
76 } | |
77 if (is.null(opt$input)) { | |
78 cat("'input' is required\n") | |
79 q(status = 1) | |
80 } | |
81 if (is.null(opt$goDatabase)) { | |
82 cat("'goDatabase' is required\n") | |
83 q(status = 1) | |
84 } | |
85 if (is.null(opt$goAnnotations)) { | |
86 cat("'goAnnotations' is required\n") | |
87 q(status = 1) | |
88 } | |
89 if (is.null(opt$goDivision)) { | |
90 cat("'goDivision' is required\n") | |
91 q(status = 1) | |
92 } | |
93 if (is.null(opt$clusterheight)) { | |
94 cat("'clusterheight' is required\n") | |
95 q(status = 1) | |
96 } | |
97 if (is.null(opt$largest)) { | |
98 cat("'largest' is required\n") | |
99 q(status = 1) | |
100 } | |
101 if (is.null(opt$smallest)) { | |
102 cat("'smallest' is required\n") | |
103 q(status = 1) | |
104 } | |
105 if (is.null(opt$pcut)) { | |
106 cat("'pcut' is required\n") | |
107 q(status = 1) | |
108 } | |
109 if (is.null(opt$hcut)) { | |
110 cat("'hcut' is required\n") | |
111 q(status = 1) | |
112 } | |
113 | |
114 source_local <- function(fname) { | |
115 # argv <- commandArgs(trailingOnly = FALSE) | |
116 # base_dir <- dirname(substring(argv[grep("--file = ", argv)], 8)) | |
117 base_dir <- opt$scriptdir | |
118 source(paste(base_dir, fname, sep = "/")) | |
119 } | |
120 | |
121 source_local("gomwu.functions.R") | |
122 | |
123 # It might take a few minutes for MF and BP. Do not rerun it if you just want | |
124 # to replot the data with different cutoffs, go straight to gomwuPlot. If you | |
125 # change any of the numeric values below, delete the files that were generated | |
126 # in previos runs first. | |
127 | |
128 gomwuStats(opt$input, opt$goDatabase, opt$goAnnotations, opt$goDivision, opt$scriptdir, | |
129 # replace with full path to perl executable if it is not in your system's PATH already | |
130 perlPath = "perl", | |
131 # a GO category will not be considered if it contains more than this fraction of the total number of genes | |
132 largest = opt$largest, | |
133 smallest = opt$smallest, # a GO category should contain at least this many genes to be considered. | |
134 clusterCutHeight = opt$clusterheight, # threshold for merging similar (gene-sharing) terms. See README for details. | |
135 # Alternative = "g" # by default the MWU test is two-tailed; | |
136 # specify "g" or "l" of you want to test for "greater" or "less" instead. | |
137 # Module = TRUE,Alternative="g" # un-remark this if you are analyzing a | |
138 # SIGNED WGCNA module (values: 0 for not in module genes, kME for in-module genes). | |
139 # In the call to gomwuPlot below, specify absValue=0.001 (count number of "good genes" that fall into the module) | |
140 # Module = TRUE # un-remark this if you are analyzing an UNSIGNED WGCNA module | |
141 ) | |
142 # do not continue if the printout shows that no GO terms pass 10% FDR. | |
143 | |
144 | |
145 # ----------- Plotting results | |
146 | |
147 # change this to a pdf output | |
148 pdf() | |
149 | |
150 results <- gomwuPlot(opt$input, opt$goAnnotations, opt$goDivision, | |
151 # genes with the measure value exceeding this will be counted as "good genes". | |
152 # This setting is for signed log-pvalues. Specify absValue=0.001 if you are doing | |
153 # Fisher's exact test for standard GO enrichment or analyzing a WGCNA module (all non-zero genes = "good genes"). | |
154 absValue = -log(0.05, 10), | |
155 # absValue = 1, # un-remark this if you are using log2-fold changes | |
156 # FDR threshold for plotting. Specify level1=1 to plot all GO categories containing genes exceeding the absValue. | |
157 level1 = 0.1, | |
158 level2 = 0.05, # FDR cutoff to print in regular (not italic) font. | |
159 level3 = 0.01, # FDR cutoff to print in large bold font. | |
160 # decrease to fit more on one page, or increase (after rescaling the plot so the tree fits the text) | |
161 # for better "word cloud" effect | |
162 txtsize = 1.2, | |
163 treeHeight = 0.5, # height of the hierarchical clustering tree | |
164 # colors = c("dodgerblue2","firebrick1","skyblue2","lightcoral") | |
165 # these are default colors, uncomment and change if needed | |
166 ) | |
167 # manually rescale the plot so the tree matches the text | |
168 # if there are too many categories displayed, try make it more stringent with level1 = 0.05,level2=0.01,level3=0.001. | |
169 | |
170 # text representation of results, with actual adjusted p-values | |
171 write.table(results[[1]], "results.tsv", sep = "\t") | |
172 | |
173 | |
174 # ------- extracting representative GOs | |
175 | |
176 # this module chooses GO terms that best represent *independent* groups of significant GO terms | |
177 | |
178 pcut <- opt$pcut | |
179 hcut <- opt$hcut | |
180 | |
181 # plotting the GO tree with the cut level (uncomment the next two lines to plot) | |
182 # plot(results[[2]],cex = 0.6) | |
183 # abline(h = hcut,col="red") | |
184 | |
185 # cutting | |
186 ct <- cutree(results[[2]], h = hcut) | |
187 annots <- c() | |
188 ci <- 1 | |
189 for (ci in unique(ct)) { | |
190 message(ci) | |
191 rn <- names(ct)[ct == ci] | |
192 obs <- grep("obsolete", rn) | |
193 if (length(obs) > 0) { | |
194 rn <- rn[-obs] | |
195 } | |
196 if (length(rn) == 0) { | |
197 next | |
198 } | |
199 rr <- results[[1]][rn, ] | |
200 bestrr <- rr[which(rr$pval == min(rr$pval)), ] | |
201 best <- 1 | |
202 if (nrow(bestrr) > 1) { | |
203 nns <- sub(" .+", "", row.names(bestrr)) | |
204 fr <- c() | |
205 for (i in 1:length(nns)) { | |
206 fr <- c(fr, eval(parse(text = nns[i]))) | |
207 } | |
208 best <- which(fr == max(fr)) | |
209 } | |
210 if (bestrr$pval[best] <= pcut) { | |
211 annots <- c(annots, sub("\\d+\\/\\d+ ", "", row.names(bestrr)[best])) | |
212 } | |
213 } | |
214 | |
215 mwus <- read.table(paste("MWU", opt$goDivision, opt$input, sep = "_"), header = T) | |
216 best_GOs <- mwus[mwus$name %in% annots, ] | |
217 write.table(best_GOs, "best_go.tsv", sep = "\t") | |
218 dev.off() |