# HG changeset patch # User proteore # Date 1512398361 18000 # Node ID 472ad7da3d927e72e9dd1f006a7fd62f1b7a2b3a planemo upload commit 9f9e8b72a239e58b5087b2b3737262c25cc2671e-dirty diff -r 000000000000 -r 472ad7da3d92 README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Mon Dec 04 09:39:21 2017 -0500 @@ -0,0 +1,68 @@ +Wrapper for topGO +============================= + +**Authors** + +Alexa A and Rahnenfuhrer J (2016). topGO: Enrichment Analysis for Gene Ontology. R package version 2.30.0. + +**Galaxy integration** + +Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR + +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform + +This work has been partially funded through the French National Agency for Research (ANR) IFB project. + +Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool. + +============================= + +**Galaxy component based on R package topGO.** + +**Input required** + +This component works with Ensembl gene ids (e.g : ENSG0000013618). You can +copy/paste these identifiers or supply a tabular file (.csv, .tsv, .txt, .tab) +where there are contained. + +**Principle** + +This component provides the GO terms representativity of a gene list in one ontology category (Biological Process "BP", Cellular Component "CC", Molecular Function "MF"). This representativity is evaluated in comparison to the background list of all human genes associated associated with GO terms of the chosen category (BP,CC,MF). This background is given by the R package "org.Hs.eg.db", which is a genome wide association package for **human**. + +**Output** + +Three kind of outputs are available : a textual output, a barplot output and +a dotplot output. + +*Textual output* : +The text output lists all the GO-terms that were found significant under the specified threshold. + + +The different fields are as follow : + +- Annotated : number of genes in org.Hs.eg.db which are annotated with the GO-term. + +- Significant : number of genes belonging to your input which are annotated with the GO-term. + +- Expected : show an estimate of the number of genes a node of size Annotated would have if the significant genes were to be randomly selected from the gene universe. + +- pvalues : pvalue obtained after the test + +- ( qvalues : additional column with adjusted pvalues ) + + +**Tests** + +topGO provides a classic fisher test for evaluating if some GO terms are over-represented in your gene list, but other options are also provided (elim, weight01,parentchild). For the merits of each option and their algorithmic descriptions, please refer to topGO manual : +https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf + +**Multiple testing corrections** + +Furthermore, the following corrections for multiple testing can also be applied : +- holm +- hochberg +- hommel +- bonferroni +- BH +- BY +- fdr diff -r 000000000000 -r 472ad7da3d92 enrichment_v3.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/enrichment_v3.R Mon Dec 04 09:39:21 2017 -0500 @@ -0,0 +1,350 @@ +# enrichment_v3.R +# Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or +# copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g +# : classic/elim...) --threshold threshold --correction correction --textoutput +# text --barplotoutput barplot +# --dotplotoutput dotplot --column column --geneuniver human +# e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt +# --ontology BP --option classic --threshold 1e-15 --correction holm +# --textoutput TRUE +# --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse +# org.Hs.eg.db +# INPUT : +# - type of input. Can be ids separated by a blank space (copypast), or a text +# file (tabfile) +# - file with at least one column of ensembl ids +# - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF) +# - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic) +# - threshold for enriched GO term pvalues (e.g : 1e-15) +# - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none +# - outputs wanted in this order text, barplot, dotplot with boolean value (e.g +# : TRUE TRUE TRUE ). +# Declare the output not wanted as none +# - column containing the ensembl ids if the input file is a tabfile +# - gene universe reference for the user chosen specie +# - header : if the input is a text file, does this text file have a header +# (TRUE/FALSE) +# +# OUTPUT : +# - outputs commanded by the user named respectively result.tsv for the text +# results file, barplot.png for the barplot image file and dotplot.png for the +# dotplot image file + + +# loading topGO library +library("topGO") + +'%!in%' <- function(x,y)!('%in%'(x,y)) + + +# Parse command line arguments + +args = commandArgs(trailingOnly = TRUE) + +# create a list of the arguments from the command line, separated by a blank space +hh <- paste(unlist(args),collapse=' ') + +# delete the first element of the list which is always a blank space +listoptions <- unlist(strsplit(hh,'--'))[-1] + +# for each input, split the arguments with blank space as separator, unlist, +# and delete the first element which is the input name (e.g --inputtype) +options.args <- sapply(listoptions,function(x){ + unlist(strsplit(x, ' '))[-1] + }) +# same as the step above, except that only the names are kept +options.names <- sapply(listoptions,function(x){ + option <- unlist(strsplit(x, ' '))[1] +}) +names(options.args) <- unlist(options.names) + + +if (length(options.args) != 12) { + stop("Not enough/Too many arguments", call. = FALSE) +} + +typeinput = options.args[1] +listfile = options.args[2] +onto = as.character(options.args[3]) +option = as.character(options.args[4]) +correction = as.character(options.args[6]) +threshold = as.numeric(options.args[5]) +text = as.character(options.args[7]) +barplot = as.character(options.args[8]) +dotplot = as.character(options.args[9]) +column = as.numeric(gsub("c","",options.args[10])) +geneuniverse = as.character(options.args[11]) +header = as.character(options.args[12]) + +if (typeinput=="copypaste"){ + sample = as.data.frame(unlist(listfile)) + sample = sample[,column] +} +if (typeinput=="tabfile"){ + + if (header=="TRUE"){ + sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE) + }else{ + sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE) + } + sample = sample[,column] + +} +# Launch enrichment analysis and return result data from the analysis or the null +# object if the enrichment could not be done. +goEnrichment = function(geneuniverse,sample,onto){ + + # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their + # associated ensembl ids according to the org package + xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") + allGenes = unique(unlist(xx)) + # check if the genes given by the user can be found in the org package (gene + # universe), that is in + # allGenes + if (length(intersect(sample,allGenes))==0){ + + print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") + return(c(NULL,NULL)) + + } + + geneList = factor(as.integer(allGenes %in% sample)) + names(geneList) <- allGenes + + + #topGO enrichment + + + # Creation of a topGOdata object + # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy + # Parameters : + # ontology : character string specifying the ontology of interest (BP, CC, MF) + # allGenes : named vector of type numeric or factor + # annot : tells topGO how to map genes to GO annotations. + # argument not used here : nodeSize : at which minimal number of GO annotations + # do we consider a gene + + myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") + + + # Performing enrichment tests + result <- runTest(myGOdata, algorithm=option, statistic="fisher") + return(c(result,myGOdata)) +} + +# Some libraries such as GOsummaries won't be able to treat the values such as +# "< 1e-30" produced by topGO. As such it is important to delete the < char +# with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. +deleteInfChar = function(values){ + + lines = grep("<",values) + if (length(lines)!=0){ + for (line in lines){ + values[line]=gsub("<","",values[line]) + } + } + return(values) +} + +corrMultipleTesting = function(result, myGOdata,correction,threshold){ + + # adjust for multiple testing + if (correction!="none"){ + # GenTable : transforms the result object into a list. Filters can be applied + # (e.g : with the topNodes argument, to get for instance only the n first + # GO terms with the lowest pvalues), but as we want to apply a correction we + # take all the GO terms, no matter their pvalues + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) + # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these + # values are converted to 1e-30 to be able to correct the pvalues + pvaluestmp = deleteInfChar(allRes$test) + + # the correction is done from the modified pvalues + allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) + allRes = as.data.frame(allRes) + + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + + names(allRes)[nb] = "pvalues" + + allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] + if (length(allRes$pvalues)==0){ + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + allRes = allRes[order(allRes$qvalues),] + } + + if (correction=="none"){ + # get all the go terms under user threshold + mysummary <- summary(attributes(result)$score <= threshold) + numsignif <- as.integer(mysummary[[3]]) + # get all significant nodes + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) + + + allRes = as.data.frame(allRes) + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + names(allRes)[nb] = "pvalues" + if (numsignif==0){ + + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + + allRes = allRes[order(allRes$pvalues),] + } + + return(allRes) +} + +# roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 +roundValues = function(values){ + for (line in 1:length(values)){ + values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) + } + return(values) +} + +createDotPlot = function(data, onto){ + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + geneRatio = data$Significant/data$Annotated + goTerms = data$Term + count = data$Significant + + labely = paste("GO terms",onto,sep=" ") + png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px") + sp1 = ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n") + + plot(sp1) + dev.off() +} + +createBarPlot = function(data, onto){ + + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + + values = as.numeric(values) + goTerms = data$Term + count = data$Significant + png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px") + + labely = paste("GO terms",onto,sep=" ") + p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") + plot(p) + dev.off() +} + + +# Produce the different outputs +createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ + + + if (is.null(result)){ + + if (text=="TRUE"){ + + err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." + write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + + } + + if (barplot=="TRUE"){ + + png(filename="barplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + } + + if (dotplot=="TRUE"){ + + png(filename="dotplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + + } + return(TRUE) + } + + + if (is.null(cut_result)){ + + + if (text=="TRUE"){ + + err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." + write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + + } + + if (barplot=="TRUE"){ + + png(filename="barplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + } + + if (dotplot=="TRUE"){ + + png(filename="dotplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + + } + return(TRUE) + + + + } + + if (text=="TRUE"){ + write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) + } + + if (barplot=="TRUE"){ + + createBarPlot(cut_result, onto) + } + + if (dotplot=="TRUE"){ + + createDotPlot(cut_result, onto) + } + return(TRUE) +} + + + +# Load R library ggplot2 to plot graphs +library(ggplot2) + +# Launch enrichment analysis +allresult = goEnrichment(geneuniverse,sample,onto) +result = allresult[1][[1]] +myGOdata = allresult[2][[1]] +if (!is.null(result)){ + + # Adjust the result with a multiple testing correction or not and with the user + # p-value cutoff + cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) +}else{ + + cut_result=NULL + +} + + +createOutputs(result, cut_result,text, barplot, dotplot, onto) + diff -r 000000000000 -r 472ad7da3d92 test-data/Barplot_output_for_topGO_analysis.png Binary file test-data/Barplot_output_for_topGO_analysis.png has changed diff -r 000000000000 -r 472ad7da3d92 test-data/Dotplot_output_for_topGO_analysis.png Binary file test-data/Dotplot_output_for_topGO_analysis.png has changed diff -r 000000000000 -r 472ad7da3d92 test-data/Text_output_for_topGO_analysis.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/Text_output_for_topGO_analysis.txt Mon Dec 04 09:39:21 2017 -0500 @@ -0,0 +1,71 @@ +GO.ID Term Annotated Significant Expected pvalues qvalues +GO:0045087 innate immune response 1428 35 2.84 6.5e-27 9.28915e-24 +GO:0007268 synaptic transmission 766 20 1.52 7.2e-17 9.35410909090909e-14 +GO:0019933 cAMP-mediated signaling 56 9 0.11 1.2e-15 1.4291e-12 +GO:0050796 regulation of insulin secretion 208 13 0.41 2.6e-15 2.8582e-12 +GO:0007193 adenylate cyclase-inhibiting G-protein c... 75 9 0.15 1.9e-14 1.93949285714286e-11 +GO:1904322 cellular response to forskolin 5 5 0.01 2.3e-14 2.19128666666667e-11 +GO:0007189 adenylate cyclase-activating G-protein c... 84 9 0.17 5.5e-14 4.91253125e-11 +GO:0002223 stimulatory C-type lectin receptor signa... 139 10 0.28 1.1e-13 9.24711764705882e-11 +GO:0006171 cAMP biosynthetic process 139 13 0.28 1.2e-13 9.52733333333333e-11 +GO:0055085 transmembrane transport 1435 25 2.86 3.7e-13 2.78298421052632e-10 +GO:0007596 blood coagulation 590 21 1.17 9.7e-12 6.931135e-09 +GO:0048010 vascular endothelial growth factor recep... 317 11 0.63 1.6e-11 1.0888380952381e-08 +GO:0038095 Fc-epsilon receptor signaling pathway 316 10 0.63 3.9e-10 2.53340454545455e-07 +GO:0030168 platelet activation 263 11 0.52 6.2e-10 3.85235652173913e-07 +GO:2000480 negative regulation of cAMP-dependent pr... 8 4 0.02 9.2e-10 5.47821666666667e-07 +GO:0038096 Fc-gamma receptor signaling pathway invo... 74 6 0.15 6.7e-09 3.829988e-06 +GO:0010881 regulation of cardiac muscle contraction... 19 4 0.04 5.0e-08 2.74826923076923e-05 +GO:0051343 positive regulation of cyclic-nucleotide... 5 3 0.01 7.2e-08 3.81093333333333e-05 +GO:0010800 positive regulation of peptidyl-threonin... 26 4 0.05 1.9e-07 9.69746428571429e-05 +GO:1901841 regulation of high voltage-gated calcium... 8 3 0.02 4.0e-07 0.00019711724137931 +GO:0022400 regulation of rhodopsin mediated signali... 33 4 0.07 5.2e-07 0.00023972 +GO:0001975 response to amphetamine 33 4 0.07 5.2e-07 0.00023972 +GO:0060316 positive regulation of ryanodine-sensiti... 9 3 0.02 6.0e-07 0.000259836363636364 +GO:1901844 regulation of cell communication by elec... 9 3 0.02 6.0e-07 0.000259836363636364 +GO:0007190 activation of adenylate cyclase activity 35 4 0.07 6.6e-07 0.000277413529411765 +GO:0071361 cellular response to ethanol 11 3 0.02 1.2e-06 0.000476366666666667 +GO:0060315 negative regulation of ryanodine-sensiti... 11 3 0.02 1.2e-06 0.000476366666666667 +GO:0032465 regulation of cytokinesis 98 5 0.2 1.4e-06 0.00054074054054054 +GO:0010801 negative regulation of peptidyl-threonin... 13 3 0.03 2.0e-06 0.000752157894736842 +GO:0007611 learning or memory 229 11 0.46 2.1e-06 0.000769515384615385 +GO:0032516 positive regulation of phosphoprotein ph... 15 3 0.03 3.2e-06 0.00111539512195122 +GO:0005513 detection of calcium ion 15 3 0.03 3.2e-06 0.00111539512195122 +GO:0097338 response to clozapine 2 2 0 3.9e-06 0.00132702142857143 +GO:0051447 negative regulation of meiotic cell cycl... 19 3 0.04 6.9e-06 0.00229320697674419 +GO:0031954 positive regulation of protein autophosp... 20 3 0.04 8.1e-06 0.00263084318181818 +GO:0051000 positive regulation of nitric-oxide synt... 21 3 0.04 9.4e-06 0.00298523111111111 +GO:0000186 activation of MAPKK activity 258 6 0.51 1.1e-05 0.00341741304347826 +GO:0043647 inositol phosphate metabolic process 71 4 0.14 1.2e-05 0.00364876595744681 +GO:0051412 response to corticosterone 26 3 0.05 1.8e-05 0.00524975510204082 +GO:0007616 long-term memory 26 3 0.05 1.8e-05 0.00524975510204082 +GO:0005980 glycogen catabolic process 27 3 0.05 2.0e-05 0.0057164 +GO:0002576 platelet degranulation 89 4 0.18 2.9e-05 0.00812625490196078 +GO:0002027 regulation of heart rate 90 4 0.18 3.0e-05 0.00824480769230769 +GO:0046209 nitric oxide metabolic process 94 4 0.19 3.5e-05 0.00943745283018868 +GO:0046777 protein autophosphorylation 231 8 0.46 3.8e-05 0.0100566296296296 +GO:0035556 intracellular signal transduction 2849 30 5.67 4.1e-05 0.0106532909090909 +GO:0018107 peptidyl-threonine phosphorylation 78 8 0.16 4.3e-05 0.0109734464285714 +GO:0019433 triglyceride catabolic process 35 3 0.07 4.5e-05 0.0112823684210526 +GO:0048016 inositol phosphate-mediated signaling 36 3 0.07 4.9e-05 0.0120734310344828 +GO:1901621 negative regulation of smoothened signal... 6 2 0.01 5.7e-05 0.0138065593220339 +GO:0050730 regulation of peptidyl-tyrosine phosphor... 214 5 0.43 6.2e-05 0.0147673666666667 +GO:0042045 epithelial fluid transport 7 2 0.01 8.0e-05 0.0187422950819672 +GO:0044281 small molecule metabolic process 2690 28 5.35 8.4e-05 0.019362 +GO:0018105 peptidyl-serine phosphorylation 237 5 0.47 0.00010 0.022684126984127 +GO:0021762 substantia nigra development 47 3 0.09 0.00011 0.02456265625 +GO:0034351 negative regulation of glial cell apopto... 9 2 0.02 0.00014 0.0307806153846154 +GO:0007411 axon guidance 600 7 1.19 0.00015 0.0324795454545454 +GO:0090330 regulation of platelet aggregation 13 2 0.03 0.00030 0.063989552238806 +GO:0008286 insulin receptor signaling pathway 361 7 0.72 0.00036 0.0745617391304348 +GO:0070588 calcium ion transmembrane transport 205 7 0.41 0.00036 0.0745617391304348 +GO:0071902 positive regulation of protein serine/th... 316 5 0.63 0.00038 0.0775797142857143 +GO:0046827 positive regulation of protein export fr... 15 2 0.03 0.00040 0.080512676056338 +GO:0071380 cellular response to prostaglandin E sti... 17 2 0.03 0.00051 0.0998412328767123 +GO:0035413 positive regulation of catenin import in... 17 2 0.03 0.00051 0.0998412328767123 +GO:0045787 positive regulation of cell cycle 340 5 0.68 0.00053 0.102354459459459 +GO:0006094 gluconeogenesis 84 3 0.17 0.00062 0.118138933333333 +GO:0051592 response to calcium ion 114 6 0.23 0.00077 0.144790394736842 +GO:0043536 positive regulation of blood vessel endo... 22 2 0.04 0.00087 0.16146974025974 +GO:0071364 cellular response to epidermal growth fa... 23 2 0.05 0.00095 0.174057051282051 +GO:0042752 regulation of circadian rhythm 98 3 0.2 0.00097 0.175471772151899 diff -r 000000000000 -r 472ad7da3d92 test-data/prot_reactome_EGFR_mapped_ensg.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/prot_reactome_EGFR_mapped_ensg.txt Mon Dec 04 09:39:21 2017 -0500 @@ -0,0 +1,36 @@ +ENSG00000005249 +ENSG00000072062 +ENSG00000078295 +ENSG00000096433 +ENSG00000108946 +ENSG00000114302 +ENSG00000115252 +ENSG00000118260 +ENSG00000121281 +ENSG00000123104 +ENSG00000123360 +ENSG00000124181 +ENSG00000126583 +ENSG00000129467 +ENSG00000138031 +ENSG00000138798 +ENSG00000142875 +ENSG00000143933 +ENSG00000146648 +ENSG00000150995 +ENSG00000152495 +ENSG00000154229 +ENSG00000154678 +ENSG00000155897 +ENSG00000160014 +ENSG00000162104 +ENSG00000163932 +ENSG00000164742 +ENSG00000165059 +ENSG00000168710 +ENSG00000171132 +ENSG00000173020 +ENSG00000173175 +ENSG00000174233 +ENSG00000188191 +ENSG00000198668 diff -r 000000000000 -r 472ad7da3d92 topGO.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/topGO.xml Mon Dec 04 09:39:21 2017 -0500 @@ -0,0 +1,217 @@ + + + performs enrichment analysis using R package topGO + + + bioconductor-graph + bioconductor-topgo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + condtext['textoutput']=="TRUE" + + + + condbar['barplotoutput']=="TRUE" + + + + conddot['dotplotoutput']=="TRUE" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +