comparison enrichment_v3.R @ 0:472ad7da3d92 draft

planemo upload commit 9f9e8b72a239e58b5087b2b3737262c25cc2671e-dirty
author proteore
date Mon, 04 Dec 2017 09:39:21 -0500
parents
children 829cbdb71efa
comparison
equal deleted inserted replaced
-1:000000000000 0:472ad7da3d92
1 # enrichment_v3.R
2 # Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or
3 # copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g
4 # : classic/elim...) --threshold threshold --correction correction --textoutput
5 # text --barplotoutput barplot
6 # --dotplotoutput dotplot --column column --geneuniver human
7 # e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt
8 # --ontology BP --option classic --threshold 1e-15 --correction holm
9 # --textoutput TRUE
10 # --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse
11 # org.Hs.eg.db
12 # INPUT :
13 # - type of input. Can be ids separated by a blank space (copypast), or a text
14 # file (tabfile)
15 # - file with at least one column of ensembl ids
16 # - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF)
17 # - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic)
18 # - threshold for enriched GO term pvalues (e.g : 1e-15)
19 # - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none
20 # - outputs wanted in this order text, barplot, dotplot with boolean value (e.g
21 # : TRUE TRUE TRUE ).
22 # Declare the output not wanted as none
23 # - column containing the ensembl ids if the input file is a tabfile
24 # - gene universe reference for the user chosen specie
25 # - header : if the input is a text file, does this text file have a header
26 # (TRUE/FALSE)
27 #
28 # OUTPUT :
29 # - outputs commanded by the user named respectively result.tsv for the text
30 # results file, barplot.png for the barplot image file and dotplot.png for the
31 # dotplot image file
32
33
34 # loading topGO library
35 library("topGO")
36
37 '%!in%' <- function(x,y)!('%in%'(x,y))
38
39
40 # Parse command line arguments
41
42 args = commandArgs(trailingOnly = TRUE)
43
44 # create a list of the arguments from the command line, separated by a blank space
45 hh <- paste(unlist(args),collapse=' ')
46
47 # delete the first element of the list which is always a blank space
48 listoptions <- unlist(strsplit(hh,'--'))[-1]
49
50 # for each input, split the arguments with blank space as separator, unlist,
51 # and delete the first element which is the input name (e.g --inputtype)
52 options.args <- sapply(listoptions,function(x){
53 unlist(strsplit(x, ' '))[-1]
54 })
55 # same as the step above, except that only the names are kept
56 options.names <- sapply(listoptions,function(x){
57 option <- unlist(strsplit(x, ' '))[1]
58 })
59 names(options.args) <- unlist(options.names)
60
61
62 if (length(options.args) != 12) {
63 stop("Not enough/Too many arguments", call. = FALSE)
64 }
65
66 typeinput = options.args[1]
67 listfile = options.args[2]
68 onto = as.character(options.args[3])
69 option = as.character(options.args[4])
70 correction = as.character(options.args[6])
71 threshold = as.numeric(options.args[5])
72 text = as.character(options.args[7])
73 barplot = as.character(options.args[8])
74 dotplot = as.character(options.args[9])
75 column = as.numeric(gsub("c","",options.args[10]))
76 geneuniverse = as.character(options.args[11])
77 header = as.character(options.args[12])
78
79 if (typeinput=="copypaste"){
80 sample = as.data.frame(unlist(listfile))
81 sample = sample[,column]
82 }
83 if (typeinput=="tabfile"){
84
85 if (header=="TRUE"){
86 sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE)
87 }else{
88 sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE)
89 }
90 sample = sample[,column]
91
92 }
93 # Launch enrichment analysis and return result data from the analysis or the null
94 # object if the enrichment could not be done.
95 goEnrichment = function(geneuniverse,sample,onto){
96
97 # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their
98 # associated ensembl ids according to the org package
99 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl")
100 allGenes = unique(unlist(xx))
101 # check if the genes given by the user can be found in the org package (gene
102 # universe), that is in
103 # allGenes
104 if (length(intersect(sample,allGenes))==0){
105
106 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.")
107 return(c(NULL,NULL))
108
109 }
110
111 geneList = factor(as.integer(allGenes %in% sample))
112 names(geneList) <- allGenes
113
114
115 #topGO enrichment
116
117
118 # Creation of a topGOdata object
119 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy
120 # Parameters :
121 # ontology : character string specifying the ontology of interest (BP, CC, MF)
122 # allGenes : named vector of type numeric or factor
123 # annot : tells topGO how to map genes to GO annotations.
124 # argument not used here : nodeSize : at which minimal number of GO annotations
125 # do we consider a gene
126
127 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl")
128
129
130 # Performing enrichment tests
131 result <- runTest(myGOdata, algorithm=option, statistic="fisher")
132 return(c(result,myGOdata))
133 }
134
135 # Some libraries such as GOsummaries won't be able to treat the values such as
136 # "< 1e-30" produced by topGO. As such it is important to delete the < char
137 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output.
138 deleteInfChar = function(values){
139
140 lines = grep("<",values)
141 if (length(lines)!=0){
142 for (line in lines){
143 values[line]=gsub("<","",values[line])
144 }
145 }
146 return(values)
147 }
148
149 corrMultipleTesting = function(result, myGOdata,correction,threshold){
150
151 # adjust for multiple testing
152 if (correction!="none"){
153 # GenTable : transforms the result object into a list. Filters can be applied
154 # (e.g : with the topNodes argument, to get for instance only the n first
155 # GO terms with the lowest pvalues), but as we want to apply a correction we
156 # take all the GO terms, no matter their pvalues
157 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score))
158 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these
159 # values are converted to 1e-30 to be able to correct the pvalues
160 pvaluestmp = deleteInfChar(allRes$test)
161
162 # the correction is done from the modified pvalues
163 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp))
164 allRes = as.data.frame(allRes)
165
166 # Rename the test column by pvalues, so that is more explicit
167 nb = which(names(allRes) %in% c("test"))
168
169 names(allRes)[nb] = "pvalues"
170
171 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),]
172 if (length(allRes$pvalues)==0){
173 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
174 return(NULL)
175 }
176 allRes = allRes[order(allRes$qvalues),]
177 }
178
179 if (correction=="none"){
180 # get all the go terms under user threshold
181 mysummary <- summary(attributes(result)$score <= threshold)
182 numsignif <- as.integer(mysummary[[3]])
183 # get all significant nodes
184 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif)
185
186
187 allRes = as.data.frame(allRes)
188 # Rename the test column by pvalues, so that is more explicit
189 nb = which(names(allRes) %in% c("test"))
190 names(allRes)[nb] = "pvalues"
191 if (numsignif==0){
192
193 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
194 return(NULL)
195 }
196
197 allRes = allRes[order(allRes$pvalues),]
198 }
199
200 return(allRes)
201 }
202
203 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17
204 roundValues = function(values){
205 for (line in 1:length(values)){
206 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line])))
207 }
208 return(values)
209 }
210
211 createDotPlot = function(data, onto){
212
213 values = deleteInfChar(data$pvalues)
214 values = roundValues(values)
215 values = as.numeric(values)
216
217 geneRatio = data$Significant/data$Annotated
218 goTerms = data$Term
219 count = data$Significant
220
221 labely = paste("GO terms",onto,sep=" ")
222 png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px")
223 sp1 = ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n")
224
225 plot(sp1)
226 dev.off()
227 }
228
229 createBarPlot = function(data, onto){
230
231
232 values = deleteInfChar(data$pvalues)
233 values = roundValues(values)
234
235 values = as.numeric(values)
236 goTerms = data$Term
237 count = data$Significant
238 png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px")
239
240 labely = paste("GO terms",onto,sep=" ")
241 p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n")
242 plot(p)
243 dev.off()
244 }
245
246
247 # Produce the different outputs
248 createOutputs = function(result, cut_result,text, barplot, dotplot, onto){
249
250
251 if (is.null(result)){
252
253 if (text=="TRUE"){
254
255 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)."
256 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
257
258 }
259
260 if (barplot=="TRUE"){
261
262 png(filename="barplot.png")
263 plot.new()
264 #text(0,0,err_msg)
265 dev.off()
266 }
267
268 if (dotplot=="TRUE"){
269
270 png(filename="dotplot.png")
271 plot.new()
272 #text(0,0,err_msg)
273 dev.off()
274
275 }
276 return(TRUE)
277 }
278
279
280 if (is.null(cut_result)){
281
282
283 if (text=="TRUE"){
284
285 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value."
286 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
287
288 }
289
290 if (barplot=="TRUE"){
291
292 png(filename="barplot.png")
293 plot.new()
294 text(0,0,err_msg)
295 dev.off()
296 }
297
298 if (dotplot=="TRUE"){
299
300 png(filename="dotplot.png")
301 plot.new()
302 text(0,0,err_msg)
303 dev.off()
304
305 }
306 return(TRUE)
307
308
309
310 }
311
312 if (text=="TRUE"){
313 write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F)
314 }
315
316 if (barplot=="TRUE"){
317
318 createBarPlot(cut_result, onto)
319 }
320
321 if (dotplot=="TRUE"){
322
323 createDotPlot(cut_result, onto)
324 }
325 return(TRUE)
326 }
327
328
329
330 # Load R library ggplot2 to plot graphs
331 library(ggplot2)
332
333 # Launch enrichment analysis
334 allresult = goEnrichment(geneuniverse,sample,onto)
335 result = allresult[1][[1]]
336 myGOdata = allresult[2][[1]]
337 if (!is.null(result)){
338
339 # Adjust the result with a multiple testing correction or not and with the user
340 # p-value cutoff
341 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold)
342 }else{
343
344 cut_result=NULL
345
346 }
347
348
349 createOutputs(result, cut_result,text, barplot, dotplot, onto)
350