0
|
1 # enrichment_v3.R
|
|
2 # Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or
|
|
3 # copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g
|
|
4 # : classic/elim...) --threshold threshold --correction correction --textoutput
|
|
5 # text --barplotoutput barplot
|
|
6 # --dotplotoutput dotplot --column column --geneuniver human
|
|
7 # e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt
|
|
8 # --ontology BP --option classic --threshold 1e-15 --correction holm
|
|
9 # --textoutput TRUE
|
|
10 # --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse
|
|
11 # org.Hs.eg.db
|
|
12 # INPUT :
|
|
13 # - type of input. Can be ids separated by a blank space (copypast), or a text
|
|
14 # file (tabfile)
|
|
15 # - file with at least one column of ensembl ids
|
|
16 # - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF)
|
|
17 # - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic)
|
|
18 # - threshold for enriched GO term pvalues (e.g : 1e-15)
|
|
19 # - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none
|
|
20 # - outputs wanted in this order text, barplot, dotplot with boolean value (e.g
|
|
21 # : TRUE TRUE TRUE ).
|
|
22 # Declare the output not wanted as none
|
|
23 # - column containing the ensembl ids if the input file is a tabfile
|
|
24 # - gene universe reference for the user chosen specie
|
|
25 # - header : if the input is a text file, does this text file have a header
|
|
26 # (TRUE/FALSE)
|
|
27 #
|
|
28 # OUTPUT :
|
|
29 # - outputs commanded by the user named respectively result.tsv for the text
|
|
30 # results file, barplot.png for the barplot image file and dotplot.png for the
|
|
31 # dotplot image file
|
|
32
|
|
33
|
|
34 # loading topGO library
|
|
35 library("topGO")
|
|
36
|
|
37 '%!in%' <- function(x,y)!('%in%'(x,y))
|
|
38
|
|
39
|
|
40 # Parse command line arguments
|
|
41
|
|
42 args = commandArgs(trailingOnly = TRUE)
|
|
43
|
|
44 # create a list of the arguments from the command line, separated by a blank space
|
|
45 hh <- paste(unlist(args),collapse=' ')
|
|
46
|
|
47 # delete the first element of the list which is always a blank space
|
|
48 listoptions <- unlist(strsplit(hh,'--'))[-1]
|
|
49
|
|
50 # for each input, split the arguments with blank space as separator, unlist,
|
|
51 # and delete the first element which is the input name (e.g --inputtype)
|
|
52 options.args <- sapply(listoptions,function(x){
|
|
53 unlist(strsplit(x, ' '))[-1]
|
|
54 })
|
|
55 # same as the step above, except that only the names are kept
|
|
56 options.names <- sapply(listoptions,function(x){
|
|
57 option <- unlist(strsplit(x, ' '))[1]
|
|
58 })
|
|
59 names(options.args) <- unlist(options.names)
|
|
60
|
|
61
|
|
62 if (length(options.args) != 12) {
|
|
63 stop("Not enough/Too many arguments", call. = FALSE)
|
|
64 }
|
|
65
|
|
66 typeinput = options.args[1]
|
|
67 listfile = options.args[2]
|
|
68 onto = as.character(options.args[3])
|
|
69 option = as.character(options.args[4])
|
|
70 correction = as.character(options.args[6])
|
|
71 threshold = as.numeric(options.args[5])
|
|
72 text = as.character(options.args[7])
|
|
73 barplot = as.character(options.args[8])
|
|
74 dotplot = as.character(options.args[9])
|
|
75 column = as.numeric(gsub("c","",options.args[10]))
|
|
76 geneuniverse = as.character(options.args[11])
|
|
77 header = as.character(options.args[12])
|
|
78
|
|
79 if (typeinput=="copypaste"){
|
|
80 sample = as.data.frame(unlist(listfile))
|
|
81 sample = sample[,column]
|
|
82 }
|
|
83 if (typeinput=="tabfile"){
|
|
84
|
|
85 if (header=="TRUE"){
|
|
86 sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE)
|
|
87 }else{
|
|
88 sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE)
|
|
89 }
|
|
90 sample = sample[,column]
|
|
91
|
|
92 }
|
|
93 # Launch enrichment analysis and return result data from the analysis or the null
|
|
94 # object if the enrichment could not be done.
|
|
95 goEnrichment = function(geneuniverse,sample,onto){
|
|
96
|
|
97 # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their
|
|
98 # associated ensembl ids according to the org package
|
|
99 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl")
|
|
100 allGenes = unique(unlist(xx))
|
|
101 # check if the genes given by the user can be found in the org package (gene
|
|
102 # universe), that is in
|
|
103 # allGenes
|
|
104 if (length(intersect(sample,allGenes))==0){
|
|
105
|
|
106 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.")
|
|
107 return(c(NULL,NULL))
|
|
108
|
|
109 }
|
|
110
|
|
111 geneList = factor(as.integer(allGenes %in% sample))
|
|
112 names(geneList) <- allGenes
|
|
113
|
|
114
|
|
115 #topGO enrichment
|
|
116
|
|
117
|
|
118 # Creation of a topGOdata object
|
|
119 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy
|
|
120 # Parameters :
|
|
121 # ontology : character string specifying the ontology of interest (BP, CC, MF)
|
|
122 # allGenes : named vector of type numeric or factor
|
|
123 # annot : tells topGO how to map genes to GO annotations.
|
|
124 # argument not used here : nodeSize : at which minimal number of GO annotations
|
|
125 # do we consider a gene
|
|
126
|
|
127 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl")
|
|
128
|
|
129
|
|
130 # Performing enrichment tests
|
|
131 result <- runTest(myGOdata, algorithm=option, statistic="fisher")
|
|
132 return(c(result,myGOdata))
|
|
133 }
|
|
134
|
|
135 # Some libraries such as GOsummaries won't be able to treat the values such as
|
|
136 # "< 1e-30" produced by topGO. As such it is important to delete the < char
|
|
137 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output.
|
|
138 deleteInfChar = function(values){
|
|
139
|
|
140 lines = grep("<",values)
|
|
141 if (length(lines)!=0){
|
|
142 for (line in lines){
|
|
143 values[line]=gsub("<","",values[line])
|
|
144 }
|
|
145 }
|
|
146 return(values)
|
|
147 }
|
|
148
|
|
149 corrMultipleTesting = function(result, myGOdata,correction,threshold){
|
|
150
|
|
151 # adjust for multiple testing
|
|
152 if (correction!="none"){
|
|
153 # GenTable : transforms the result object into a list. Filters can be applied
|
|
154 # (e.g : with the topNodes argument, to get for instance only the n first
|
|
155 # GO terms with the lowest pvalues), but as we want to apply a correction we
|
|
156 # take all the GO terms, no matter their pvalues
|
|
157 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score))
|
|
158 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these
|
|
159 # values are converted to 1e-30 to be able to correct the pvalues
|
|
160 pvaluestmp = deleteInfChar(allRes$test)
|
|
161
|
|
162 # the correction is done from the modified pvalues
|
|
163 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp))
|
|
164 allRes = as.data.frame(allRes)
|
|
165
|
|
166 # Rename the test column by pvalues, so that is more explicit
|
|
167 nb = which(names(allRes) %in% c("test"))
|
|
168
|
|
169 names(allRes)[nb] = "pvalues"
|
|
170
|
|
171 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),]
|
|
172 if (length(allRes$pvalues)==0){
|
|
173 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
|
|
174 return(NULL)
|
|
175 }
|
|
176 allRes = allRes[order(allRes$qvalues),]
|
|
177 }
|
|
178
|
|
179 if (correction=="none"){
|
|
180 # get all the go terms under user threshold
|
|
181 mysummary <- summary(attributes(result)$score <= threshold)
|
|
182 numsignif <- as.integer(mysummary[[3]])
|
|
183 # get all significant nodes
|
|
184 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif)
|
|
185
|
|
186
|
|
187 allRes = as.data.frame(allRes)
|
|
188 # Rename the test column by pvalues, so that is more explicit
|
|
189 nb = which(names(allRes) %in% c("test"))
|
|
190 names(allRes)[nb] = "pvalues"
|
|
191 if (numsignif==0){
|
|
192
|
|
193 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
|
|
194 return(NULL)
|
|
195 }
|
|
196
|
|
197 allRes = allRes[order(allRes$pvalues),]
|
|
198 }
|
|
199
|
|
200 return(allRes)
|
|
201 }
|
|
202
|
|
203 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17
|
|
204 roundValues = function(values){
|
|
205 for (line in 1:length(values)){
|
|
206 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line])))
|
|
207 }
|
|
208 return(values)
|
|
209 }
|
|
210
|
|
211 createDotPlot = function(data){
|
|
212
|
|
213 values = deleteInfChar(data$pvalues)
|
|
214 values = roundValues(values)
|
|
215 values = as.numeric(values)
|
|
216
|
|
217 geneRatio = data$Significant/data$Annotated
|
|
218 goTerms = data$Term
|
|
219 count = data$Significant
|
|
220
|
|
221 png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px")
|
|
222 sp1 = ggplot(data,aes(x=geneRatio,y=goTerms,xlabel ="Ratio" ,ylabel = "GO terms", color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + labs(color="p-values\n")
|
|
223
|
|
224 plot(sp1)
|
|
225 dev.off()
|
|
226 }
|
|
227
|
|
228 createBarPlot = function(data){
|
|
229
|
|
230
|
|
231 values = deleteInfChar(data$pvalues)
|
|
232 values = roundValues(values)
|
|
233
|
|
234 values = as.numeric(values)
|
|
235 goTerms = data$Term
|
|
236 count = data$Significant
|
|
237 png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px")
|
|
238
|
|
239 p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n")
|
|
240 plot(p)
|
|
241 dev.off()
|
|
242 }
|
|
243
|
|
244
|
|
245 # Produce the different outputs
|
|
246 createOutputs = function(result, cut_result,text, barplot,dotplot){
|
|
247
|
|
248
|
|
249 if (is.null(result)){
|
|
250
|
|
251 if (text=="TRUE"){
|
|
252
|
|
253 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)."
|
|
254 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
|
|
255
|
|
256 }
|
|
257
|
|
258 if (barplot=="TRUE"){
|
|
259
|
|
260 png(filename="barplot.png")
|
|
261 plot.new()
|
|
262 #text(0,0,err_msg)
|
|
263 dev.off()
|
|
264 }
|
|
265
|
|
266 if (dotplot=="TRUE"){
|
|
267
|
|
268 png(filename="dotplot.png")
|
|
269 plot.new()
|
|
270 #text(0,0,err_msg)
|
|
271 dev.off()
|
|
272
|
|
273 }
|
|
274 return(TRUE)
|
|
275 }
|
|
276
|
|
277
|
|
278 if (is.null(cut_result)){
|
|
279
|
|
280
|
|
281 if (text=="TRUE"){
|
|
282
|
|
283 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value."
|
|
284 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
|
|
285
|
|
286 }
|
|
287
|
|
288 if (barplot=="TRUE"){
|
|
289
|
|
290 png(filename="barplot.png")
|
|
291 plot.new()
|
|
292 text(0,0,err_msg)
|
|
293 dev.off()
|
|
294 }
|
|
295
|
|
296 if (dotplot=="TRUE"){
|
|
297
|
|
298 png(filename="dotplot.png")
|
|
299 plot.new()
|
|
300 text(0,0,err_msg)
|
|
301 dev.off()
|
|
302
|
|
303 }
|
|
304 return(TRUE)
|
|
305
|
|
306
|
|
307
|
|
308 }
|
|
309
|
|
310 if (text=="TRUE"){
|
|
311 write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
|
|
312 }
|
|
313
|
|
314 if (barplot=="TRUE"){
|
|
315
|
|
316 createBarPlot(cut_result)
|
|
317 }
|
|
318
|
|
319 if (dotplot=="TRUE"){
|
|
320
|
|
321 createDotPlot(cut_result)
|
|
322 }
|
|
323 return(TRUE)
|
|
324 }
|
|
325
|
|
326
|
|
327
|
|
328 # Load R library ggplot2 to plot graphs
|
|
329 library(ggplot2)
|
|
330
|
|
331 # Launch enrichment analysis
|
|
332 allresult = goEnrichment(geneuniverse,sample,onto)
|
|
333 result = allresult[1][[1]]
|
|
334 myGOdata = allresult[2][[1]]
|
|
335 if (!is.null(result)){
|
|
336
|
|
337 # Adjust the result with a multiple testing correction or not and with the user
|
|
338 # p-value cutoff
|
|
339 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold)
|
|
340 }else{
|
|
341
|
|
342 cut_result=NULL
|
|
343
|
|
344 }
|
|
345
|
|
346
|
|
347 createOutputs(result, cut_result,text, barplot,dotplot)
|
|
348
|