comparison enrichment_v3.R @ 0:aade04e750fa draft default tip

planemo upload
author lnguyen
date Fri, 15 Sep 2017 10:38:28 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:aade04e750fa
1 # enrichment_v3.R
2 # Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or
3 # copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g
4 # : classic/elim...) --threshold threshold --correction correction --textoutput
5 # text --barplotoutput barplot
6 # --dotplotoutput dotplot --column column --geneuniver human
7 # e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt
8 # --ontology BP --option classic --threshold 1e-15 --correction holm
9 # --textoutput TRUE
10 # --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse
11 # org.Hs.eg.db
12 # INPUT :
13 # - type of input. Can be ids separated by a blank space (copypast), or a text
14 # file (tabfile)
15 # - file with at least one column of ensembl ids
16 # - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF)
17 # - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic)
18 # - threshold for enriched GO term pvalues (e.g : 1e-15)
19 # - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none
20 # - outputs wanted in this order text, barplot, dotplot with boolean value (e.g
21 # : TRUE TRUE TRUE ).
22 # Declare the output not wanted as none
23 # - column containing the ensembl ids if the input file is a tabfile
24 # - gene universe reference for the user chosen specie
25 # - header : if the input is a text file, does this text file have a header
26 # (TRUE/FALSE)
27 #
28 # OUTPUT :
29 # - outputs commanded by the user named respectively result.tsv for the text
30 # results file, barplot.png for the barplot image file and dotplot.png for the
31 # dotplot image file
32
33
34 # loading topGO library
35 library("topGO")
36
37 '%!in%' <- function(x,y)!('%in%'(x,y))
38
39
40 # Parse command line arguments
41
42 args = commandArgs(trailingOnly = TRUE)
43
44 # create a list of the arguments from the command line, separated by a blank space
45 hh <- paste(unlist(args),collapse=' ')
46
47 # delete the first element of the list which is always a blank space
48 listoptions <- unlist(strsplit(hh,'--'))[-1]
49
50 # for each input, split the arguments with blank space as separator, unlist,
51 # and delete the first element which is the input name (e.g --inputtype)
52 options.args <- sapply(listoptions,function(x){
53 unlist(strsplit(x, ' '))[-1]
54 })
55 # same as the step above, except that only the names are kept
56 options.names <- sapply(listoptions,function(x){
57 option <- unlist(strsplit(x, ' '))[1]
58 })
59 names(options.args) <- unlist(options.names)
60
61
62 if (length(options.args) != 12) {
63 stop("Not enough/Too many arguments", call. = FALSE)
64 }
65
66 typeinput = options.args[1]
67 listfile = options.args[2]
68 onto = as.character(options.args[3])
69 option = as.character(options.args[4])
70 correction = as.character(options.args[6])
71 threshold = as.numeric(options.args[5])
72 text = as.character(options.args[7])
73 barplot = as.character(options.args[8])
74 dotplot = as.character(options.args[9])
75 column = as.numeric(gsub("c","",options.args[10]))
76 geneuniverse = as.character(options.args[11])
77 header = as.character(options.args[12])
78
79 if (typeinput=="copypaste"){
80 sample = as.data.frame(unlist(listfile))
81 sample = sample[,column]
82 }
83 if (typeinput=="tabfile"){
84
85 if (header=="TRUE"){
86 sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE)
87 }else{
88 sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE)
89 }
90 sample = sample[,column]
91
92 }
93 # Launch enrichment analysis and return result data from the analysis or the null
94 # object if the enrichment could not be done.
95 goEnrichment = function(geneuniverse,sample,onto){
96
97 # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their
98 # associated ensembl ids according to the org package
99 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl")
100 allGenes = unique(unlist(xx))
101 # check if the genes given by the user can be found in the org package (gene
102 # universe), that is in
103 # allGenes
104 if (length(intersect(sample,allGenes))==0){
105
106 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.")
107 return(c(NULL,NULL))
108
109 }
110
111 geneList = factor(as.integer(allGenes %in% sample))
112 names(geneList) <- allGenes
113
114
115 #topGO enrichment
116
117
118 # Creation of a topGOdata object
119 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy
120 # Parameters :
121 # ontology : character string specifying the ontology of interest (BP, CC, MF)
122 # allGenes : named vector of type numeric or factor
123 # annot : tells topGO how to map genes to GO annotations.
124 # argument not used here : nodeSize : at which minimal number of GO annotations
125 # do we consider a gene
126
127 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl")
128
129
130 # Performing enrichment tests
131 result <- runTest(myGOdata, algorithm=option, statistic="fisher")
132 return(c(result,myGOdata))
133 }
134
135 # Some libraries such as GOsummaries won't be able to treat the values such as
136 # "< 1e-30" produced by topGO. As such it is important to delete the < char
137 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output.
138 deleteInfChar = function(values){
139
140 lines = grep("<",values)
141 if (length(lines)!=0){
142 for (line in lines){
143 values[line]=gsub("<","",values[line])
144 }
145 }
146 return(values)
147 }
148
149 corrMultipleTesting = function(result, myGOdata,correction,threshold){
150
151 # adjust for multiple testing
152 if (correction!="none"){
153 # GenTable : transforms the result object into a list. Filters can be applied
154 # (e.g : with the topNodes argument, to get for instance only the n first
155 # GO terms with the lowest pvalues), but as we want to apply a correction we
156 # take all the GO terms, no matter their pvalues
157 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score))
158 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these
159 # values are converted to 1e-30 to be able to correct the pvalues
160 pvaluestmp = deleteInfChar(allRes$test)
161
162 # the correction is done from the modified pvalues
163 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp))
164 allRes = as.data.frame(allRes)
165
166 # Rename the test column by pvalues, so that is more explicit
167 nb = which(names(allRes) %in% c("test"))
168
169 names(allRes)[nb] = "pvalues"
170
171 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),]
172 if (length(allRes$pvalues)==0){
173 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
174 return(NULL)
175 }
176 allRes = allRes[order(allRes$qvalues),]
177 }
178
179 if (correction=="none"){
180 # get all the go terms under user threshold
181 mysummary <- summary(attributes(result)$score <= threshold)
182 numsignif <- as.integer(mysummary[[3]])
183 # get all significant nodes
184 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif)
185
186
187 allRes = as.data.frame(allRes)
188 # Rename the test column by pvalues, so that is more explicit
189 nb = which(names(allRes) %in% c("test"))
190 names(allRes)[nb] = "pvalues"
191 if (numsignif==0){
192
193 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
194 return(NULL)
195 }
196
197 allRes = allRes[order(allRes$pvalues),]
198 }
199
200 return(allRes)
201 }
202
203 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17
204 roundValues = function(values){
205 for (line in 1:length(values)){
206 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line])))
207 }
208 return(values)
209 }
210
211 createDotPlot = function(data){
212
213 values = deleteInfChar(data$pvalues)
214 values = roundValues(values)
215 values = as.numeric(values)
216
217 geneRatio = data$Significant/data$Annotated
218 goTerms = data$Term
219 count = data$Significant
220
221 png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px")
222 sp1 = ggplot(data,aes(x=geneRatio,y=goTerms,xlabel ="Ratio" ,ylabel = "GO terms", color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + labs(color="p-values\n")
223
224 plot(sp1)
225 dev.off()
226 }
227
228 createBarPlot = function(data){
229
230
231 values = deleteInfChar(data$pvalues)
232 values = roundValues(values)
233
234 values = as.numeric(values)
235 goTerms = data$Term
236 count = data$Significant
237 png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px")
238
239 p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n")
240 plot(p)
241 dev.off()
242 }
243
244
245 # Produce the different outputs
246 createOutputs = function(result, cut_result,text, barplot,dotplot){
247
248
249 if (is.null(result)){
250
251 if (text=="TRUE"){
252
253 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)."
254 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
255
256 }
257
258 if (barplot=="TRUE"){
259
260 png(filename="barplot.png")
261 plot.new()
262 #text(0,0,err_msg)
263 dev.off()
264 }
265
266 if (dotplot=="TRUE"){
267
268 png(filename="dotplot.png")
269 plot.new()
270 #text(0,0,err_msg)
271 dev.off()
272
273 }
274 return(TRUE)
275 }
276
277
278 if (is.null(cut_result)){
279
280
281 if (text=="TRUE"){
282
283 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value."
284 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
285
286 }
287
288 if (barplot=="TRUE"){
289
290 png(filename="barplot.png")
291 plot.new()
292 text(0,0,err_msg)
293 dev.off()
294 }
295
296 if (dotplot=="TRUE"){
297
298 png(filename="dotplot.png")
299 plot.new()
300 text(0,0,err_msg)
301 dev.off()
302
303 }
304 return(TRUE)
305
306
307
308 }
309
310 if (text=="TRUE"){
311 write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
312 }
313
314 if (barplot=="TRUE"){
315
316 createBarPlot(cut_result)
317 }
318
319 if (dotplot=="TRUE"){
320
321 createDotPlot(cut_result)
322 }
323 return(TRUE)
324 }
325
326
327
328 # Load R library ggplot2 to plot graphs
329 library(ggplot2)
330
331 # Launch enrichment analysis
332 allresult = goEnrichment(geneuniverse,sample,onto)
333 result = allresult[1][[1]]
334 myGOdata = allresult[2][[1]]
335 if (!is.null(result)){
336
337 # Adjust the result with a multiple testing correction or not and with the user
338 # p-value cutoff
339 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold)
340 }else{
341
342 cut_result=NULL
343
344 }
345
346
347 createOutputs(result, cut_result,text, barplot,dotplot)
348