# HG changeset patch # User lnguyen # Date 1505486308 14400 # Node ID aade04e750fa2f93faecdc06089295a6a445485c planemo upload diff -r 000000000000 -r aade04e750fa enrichment_v3.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/enrichment_v3.R Fri Sep 15 10:38:28 2017 -0400 @@ -0,0 +1,348 @@ +# enrichment_v3.R +# Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or +# copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g +# : classic/elim...) --threshold threshold --correction correction --textoutput +# text --barplotoutput barplot +# --dotplotoutput dotplot --column column --geneuniver human +# e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt +# --ontology BP --option classic --threshold 1e-15 --correction holm +# --textoutput TRUE +# --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse +# org.Hs.eg.db +# INPUT : +# - type of input. Can be ids separated by a blank space (copypast), or a text +# file (tabfile) +# - file with at least one column of ensembl ids +# - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF) +# - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic) +# - threshold for enriched GO term pvalues (e.g : 1e-15) +# - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none +# - outputs wanted in this order text, barplot, dotplot with boolean value (e.g +# : TRUE TRUE TRUE ). +# Declare the output not wanted as none +# - column containing the ensembl ids if the input file is a tabfile +# - gene universe reference for the user chosen specie +# - header : if the input is a text file, does this text file have a header +# (TRUE/FALSE) +# +# OUTPUT : +# - outputs commanded by the user named respectively result.tsv for the text +# results file, barplot.png for the barplot image file and dotplot.png for the +# dotplot image file + + +# loading topGO library +library("topGO") + +'%!in%' <- function(x,y)!('%in%'(x,y)) + + +# Parse command line arguments + +args = commandArgs(trailingOnly = TRUE) + +# create a list of the arguments from the command line, separated by a blank space +hh <- paste(unlist(args),collapse=' ') + +# delete the first element of the list which is always a blank space +listoptions <- unlist(strsplit(hh,'--'))[-1] + +# for each input, split the arguments with blank space as separator, unlist, +# and delete the first element which is the input name (e.g --inputtype) +options.args <- sapply(listoptions,function(x){ + unlist(strsplit(x, ' '))[-1] + }) +# same as the step above, except that only the names are kept +options.names <- sapply(listoptions,function(x){ + option <- unlist(strsplit(x, ' '))[1] +}) +names(options.args) <- unlist(options.names) + + +if (length(options.args) != 12) { + stop("Not enough/Too many arguments", call. = FALSE) +} + +typeinput = options.args[1] +listfile = options.args[2] +onto = as.character(options.args[3]) +option = as.character(options.args[4]) +correction = as.character(options.args[6]) +threshold = as.numeric(options.args[5]) +text = as.character(options.args[7]) +barplot = as.character(options.args[8]) +dotplot = as.character(options.args[9]) +column = as.numeric(gsub("c","",options.args[10])) +geneuniverse = as.character(options.args[11]) +header = as.character(options.args[12]) + +if (typeinput=="copypaste"){ + sample = as.data.frame(unlist(listfile)) + sample = sample[,column] +} +if (typeinput=="tabfile"){ + + if (header=="TRUE"){ + sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE) + }else{ + sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE) + } + sample = sample[,column] + +} +# Launch enrichment analysis and return result data from the analysis or the null +# object if the enrichment could not be done. +goEnrichment = function(geneuniverse,sample,onto){ + + # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their + # associated ensembl ids according to the org package + xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") + allGenes = unique(unlist(xx)) + # check if the genes given by the user can be found in the org package (gene + # universe), that is in + # allGenes + if (length(intersect(sample,allGenes))==0){ + + print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") + return(c(NULL,NULL)) + + } + + geneList = factor(as.integer(allGenes %in% sample)) + names(geneList) <- allGenes + + + #topGO enrichment + + + # Creation of a topGOdata object + # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy + # Parameters : + # ontology : character string specifying the ontology of interest (BP, CC, MF) + # allGenes : named vector of type numeric or factor + # annot : tells topGO how to map genes to GO annotations. + # argument not used here : nodeSize : at which minimal number of GO annotations + # do we consider a gene + + myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") + + + # Performing enrichment tests + result <- runTest(myGOdata, algorithm=option, statistic="fisher") + return(c(result,myGOdata)) +} + +# Some libraries such as GOsummaries won't be able to treat the values such as +# "< 1e-30" produced by topGO. As such it is important to delete the < char +# with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. +deleteInfChar = function(values){ + + lines = grep("<",values) + if (length(lines)!=0){ + for (line in lines){ + values[line]=gsub("<","",values[line]) + } + } + return(values) +} + +corrMultipleTesting = function(result, myGOdata,correction,threshold){ + + # adjust for multiple testing + if (correction!="none"){ + # GenTable : transforms the result object into a list. Filters can be applied + # (e.g : with the topNodes argument, to get for instance only the n first + # GO terms with the lowest pvalues), but as we want to apply a correction we + # take all the GO terms, no matter their pvalues + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) + # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these + # values are converted to 1e-30 to be able to correct the pvalues + pvaluestmp = deleteInfChar(allRes$test) + + # the correction is done from the modified pvalues + allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) + allRes = as.data.frame(allRes) + + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + + names(allRes)[nb] = "pvalues" + + allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] + if (length(allRes$pvalues)==0){ + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + allRes = allRes[order(allRes$qvalues),] + } + + if (correction=="none"){ + # get all the go terms under user threshold + mysummary <- summary(attributes(result)$score <= threshold) + numsignif <- as.integer(mysummary[[3]]) + # get all significant nodes + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) + + + allRes = as.data.frame(allRes) + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + names(allRes)[nb] = "pvalues" + if (numsignif==0){ + + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + + allRes = allRes[order(allRes$pvalues),] + } + + return(allRes) +} + +# roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 +roundValues = function(values){ + for (line in 1:length(values)){ + values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) + } + return(values) +} + +createDotPlot = function(data){ + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + geneRatio = data$Significant/data$Annotated + goTerms = data$Term + count = data$Significant + + png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px") + sp1 = ggplot(data,aes(x=geneRatio,y=goTerms,xlabel ="Ratio" ,ylabel = "GO terms", color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + labs(color="p-values\n") + + plot(sp1) + dev.off() +} + +createBarPlot = function(data){ + + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + + values = as.numeric(values) + goTerms = data$Term + count = data$Significant + png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px") + + p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") + plot(p) + dev.off() +} + + +# Produce the different outputs +createOutputs = function(result, cut_result,text, barplot,dotplot){ + + + if (is.null(result)){ + + if (text=="TRUE"){ + + err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." + write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + + } + + if (barplot=="TRUE"){ + + png(filename="barplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + } + + if (dotplot=="TRUE"){ + + png(filename="dotplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + + } + return(TRUE) + } + + + if (is.null(cut_result)){ + + + if (text=="TRUE"){ + + err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." + write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + + } + + if (barplot=="TRUE"){ + + png(filename="barplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + } + + if (dotplot=="TRUE"){ + + png(filename="dotplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + + } + return(TRUE) + + + + } + + if (text=="TRUE"){ + write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + } + + if (barplot=="TRUE"){ + + createBarPlot(cut_result) + } + + if (dotplot=="TRUE"){ + + createDotPlot(cut_result) + } + return(TRUE) +} + + + +# Load R library ggplot2 to plot graphs +library(ggplot2) + +# Launch enrichment analysis +allresult = goEnrichment(geneuniverse,sample,onto) +result = allresult[1][[1]] +myGOdata = allresult[2][[1]] +if (!is.null(result)){ + + # Adjust the result with a multiple testing correction or not and with the user + # p-value cutoff + cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) +}else{ + + cut_result=NULL + +} + + +createOutputs(result, cut_result,text, barplot,dotplot) + diff -r 000000000000 -r aade04e750fa test-data/Barplot_output_for_topGO_analysis.png Binary file test-data/Barplot_output_for_topGO_analysis.png has changed diff -r 000000000000 -r aade04e750fa test-data/Dotplot_output_for_topGO_analysis.png Binary file test-data/Dotplot_output_for_topGO_analysis.png has changed diff -r 000000000000 -r aade04e750fa test-data/Text_output_for_topGO_analysis.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Text_output_for_topGO_analysis.txt Fri Sep 15 10:38:28 2017 -0400 @@ -0,0 +1,71 @@ +GO.ID Term Annotated Significant Expected pvalues qvalues +GO:0045087 innate immune response 1428 35 2.84 6.5e-27 9.28915e-24 +GO:0007268 synaptic transmission 766 20 1.52 7.2e-17 9.35410909090909e-14 +GO:0019933 cAMP-mediated signaling 56 9 0.11 1.2e-15 1.4291e-12 +GO:0050796 regulation of insulin secretion 208 13 0.41 2.6e-15 2.8582e-12 +GO:0007193 adenylate cyclase-inhibiting G-protein c... 75 9 0.15 1.9e-14 1.93949285714286e-11 +GO:1904322 cellular response to forskolin 5 5 0.01 2.3e-14 2.19128666666667e-11 +GO:0007189 adenylate cyclase-activating G-protein c... 84 9 0.17 5.5e-14 4.91253125e-11 +GO:0002223 stimulatory C-type lectin receptor signa... 139 10 0.28 1.1e-13 9.24711764705882e-11 +GO:0006171 cAMP biosynthetic process 139 13 0.28 1.2e-13 9.52733333333333e-11 +GO:0055085 transmembrane transport 1435 25 2.86 3.7e-13 2.78298421052632e-10 +GO:0007596 blood coagulation 590 21 1.17 9.7e-12 6.931135e-09 +GO:0048010 vascular endothelial growth factor recep... 317 11 0.63 1.6e-11 1.0888380952381e-08 +GO:0038095 Fc-epsilon receptor signaling pathway 316 10 0.63 3.9e-10 2.53340454545455e-07 +GO:0030168 platelet activation 263 11 0.52 6.2e-10 3.85235652173913e-07 +GO:2000480 negative regulation of cAMP-dependent pr... 8 4 0.02 9.2e-10 5.47821666666667e-07 +GO:0038096 Fc-gamma receptor signaling pathway invo... 74 6 0.15 6.7e-09 3.829988e-06 +GO:0010881 regulation of cardiac muscle contraction... 19 4 0.04 5.0e-08 2.74826923076923e-05 +GO:0051343 positive regulation of cyclic-nucleotide... 5 3 0.01 7.2e-08 3.81093333333333e-05 +GO:0010800 positive regulation of peptidyl-threonin... 26 4 0.05 1.9e-07 9.69746428571429e-05 +GO:1901841 regulation of high voltage-gated calcium... 8 3 0.02 4.0e-07 0.00019711724137931 +GO:0022400 regulation of rhodopsin mediated signali... 33 4 0.07 5.2e-07 0.00023972 +GO:0001975 response to amphetamine 33 4 0.07 5.2e-07 0.00023972 +GO:0060316 positive regulation of ryanodine-sensiti... 9 3 0.02 6.0e-07 0.000259836363636364 +GO:1901844 regulation of cell communication by elec... 9 3 0.02 6.0e-07 0.000259836363636364 +GO:0007190 activation of adenylate cyclase activity 35 4 0.07 6.6e-07 0.000277413529411765 +GO:0071361 cellular response to ethanol 11 3 0.02 1.2e-06 0.000476366666666667 +GO:0060315 negative regulation of ryanodine-sensiti... 11 3 0.02 1.2e-06 0.000476366666666667 +GO:0032465 regulation of cytokinesis 98 5 0.2 1.4e-06 0.00054074054054054 +GO:0010801 negative regulation of peptidyl-threonin... 13 3 0.03 2.0e-06 0.000752157894736842 +GO:0007611 learning or memory 229 11 0.46 2.1e-06 0.000769515384615385 +GO:0032516 positive regulation of phosphoprotein ph... 15 3 0.03 3.2e-06 0.00111539512195122 +GO:0005513 detection of calcium ion 15 3 0.03 3.2e-06 0.00111539512195122 +GO:0097338 response to clozapine 2 2 0 3.9e-06 0.00132702142857143 +GO:0051447 negative regulation of meiotic cell cycl... 19 3 0.04 6.9e-06 0.00229320697674419 +GO:0031954 positive regulation of protein autophosp... 20 3 0.04 8.1e-06 0.00263084318181818 +GO:0051000 positive regulation of nitric-oxide synt... 21 3 0.04 9.4e-06 0.00298523111111111 +GO:0000186 activation of MAPKK activity 258 6 0.51 1.1e-05 0.00341741304347826 +GO:0043647 inositol phosphate metabolic process 71 4 0.14 1.2e-05 0.00364876595744681 +GO:0051412 response to corticosterone 26 3 0.05 1.8e-05 0.00524975510204082 +GO:0007616 long-term memory 26 3 0.05 1.8e-05 0.00524975510204082 +GO:0005980 glycogen catabolic process 27 3 0.05 2.0e-05 0.0057164 +GO:0002576 platelet degranulation 89 4 0.18 2.9e-05 0.00812625490196078 +GO:0002027 regulation of heart rate 90 4 0.18 3.0e-05 0.00824480769230769 +GO:0046209 nitric oxide metabolic process 94 4 0.19 3.5e-05 0.00943745283018868 +GO:0046777 protein autophosphorylation 231 8 0.46 3.8e-05 0.0100566296296296 +GO:0035556 intracellular signal transduction 2849 30 5.67 4.1e-05 0.0106532909090909 +GO:0018107 peptidyl-threonine phosphorylation 78 8 0.16 4.3e-05 0.0109734464285714 +GO:0019433 triglyceride catabolic process 35 3 0.07 4.5e-05 0.0112823684210526 +GO:0048016 inositol phosphate-mediated signaling 36 3 0.07 4.9e-05 0.0120734310344828 +GO:1901621 negative regulation of smoothened signal... 6 2 0.01 5.7e-05 0.0138065593220339 +GO:0050730 regulation of peptidyl-tyrosine phosphor... 214 5 0.43 6.2e-05 0.0147673666666667 +GO:0042045 epithelial fluid transport 7 2 0.01 8.0e-05 0.0187422950819672 +GO:0044281 small molecule metabolic process 2690 28 5.35 8.4e-05 0.019362 +GO:0018105 peptidyl-serine phosphorylation 237 5 0.47 0.00010 0.022684126984127 +GO:0021762 substantia nigra development 47 3 0.09 0.00011 0.02456265625 +GO:0034351 negative regulation of glial cell apopto... 9 2 0.02 0.00014 0.0307806153846154 +GO:0007411 axon guidance 600 7 1.19 0.00015 0.0324795454545454 +GO:0090330 regulation of platelet aggregation 13 2 0.03 0.00030 0.063989552238806 +GO:0008286 insulin receptor signaling pathway 361 7 0.72 0.00036 0.0745617391304348 +GO:0070588 calcium ion transmembrane transport 205 7 0.41 0.00036 0.0745617391304348 +GO:0071902 positive regulation of protein serine/th... 316 5 0.63 0.00038 0.0775797142857143 +GO:0046827 positive regulation of protein export fr... 15 2 0.03 0.00040 0.080512676056338 +GO:0071380 cellular response to prostaglandin E sti... 17 2 0.03 0.00051 0.0998412328767123 +GO:0035413 positive regulation of catenin import in... 17 2 0.03 0.00051 0.0998412328767123 +GO:0045787 positive regulation of cell cycle 340 5 0.68 0.00053 0.102354459459459 +GO:0006094 gluconeogenesis 84 3 0.17 0.00062 0.118138933333333 +GO:0051592 response to calcium ion 114 6 0.23 0.00077 0.144790394736842 +GO:0043536 positive regulation of blood vessel endo... 22 2 0.04 0.00087 0.16146974025974 +GO:0071364 cellular response to epidermal growth fa... 23 2 0.05 0.00095 0.174057051282051 +GO:0042752 regulation of circadian rhythm 98 3 0.2 0.00097 0.175471772151899 diff -r 000000000000 -r aade04e750fa test-data/prot_reactome_EGFR_mapped_ensg.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/prot_reactome_EGFR_mapped_ensg.txt Fri Sep 15 10:38:28 2017 -0400 @@ -0,0 +1,36 @@ +ENSG00000005249 +ENSG00000072062 +ENSG00000078295 +ENSG00000096433 +ENSG00000108946 +ENSG00000114302 +ENSG00000115252 +ENSG00000118260 +ENSG00000121281 +ENSG00000123104 +ENSG00000123360 +ENSG00000124181 +ENSG00000126583 +ENSG00000129467 +ENSG00000138031 +ENSG00000138798 +ENSG00000142875 +ENSG00000143933 +ENSG00000146648 +ENSG00000150995 +ENSG00000152495 +ENSG00000154229 +ENSG00000154678 +ENSG00000155897 +ENSG00000160014 +ENSG00000162104 +ENSG00000163932 +ENSG00000164742 +ENSG00000165059 +ENSG00000168710 +ENSG00000171132 +ENSG00000173020 +ENSG00000173175 +ENSG00000174233 +ENSG00000188191 +ENSG00000198668 diff -r 000000000000 -r aade04e750fa topGO.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/topGO.xml Fri Sep 15 10:38:28 2017 -0400 @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + condtext['textoutput']=="TRUE" + + + + condbar['barplotoutput']=="TRUE" + + + + conddot['dotplotoutput']=="TRUE" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +