Mercurial > repos > proteore > proteore_topgo
changeset 10:e3430084c996 draft
planemo upload commit ad5f1c5a1a71d7fa2bc8bac408856aa80b0fc2a3
author | proteore |
---|---|
date | Tue, 18 Dec 2018 10:06:00 -0500 |
parents | 70c0c8757f5f |
children | fa2e27165d5d |
files | README.rst enrichment_v3.R topGO.xml topGO_enrichment.R |
diffstat | 4 files changed, 498 insertions(+), 532 deletions(-) [+] |
line wrap: on
line diff
--- a/README.rst Fri Sep 21 05:32:38 2018 -0400 +++ b/README.rst Tue Dec 18 10:06:00 2018 -0500 @@ -7,7 +7,7 @@ **Galaxy integration** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform
--- a/enrichment_v3.R Fri Sep 21 05:32:38 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,380 +0,0 @@ -# enrichment_v3.R -# Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or -# copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g -# : classic/elim...) --threshold threshold --correction correction --textoutput -# text --barplotoutput barplot -# --dotplotoutput dotplot --column column --geneuniver human -# e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt -# --ontology BP --option classic --threshold 1e-15 --correction holm -# --textoutput TRUE -# --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse -# org.Hs.eg.db -# INPUT : -# - type of input. Can be ids separated by a blank space (copypast), or a text -# file (tabfile) -# - file with at least one column of ensembl ids -# - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF) -# - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic) -# - threshold for enriched GO term pvalues (e.g : 1e-15) -# - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none -# - outputs wanted in this order text, barplot, dotplot with boolean value (e.g -# : TRUE TRUE TRUE ). -# Declare the output not wanted as none -# - column containing the ensembl ids if the input file is a tabfile -# - gene universe reference for the user chosen specie -# - header : if the input is a text file, does this text file have a header -# (TRUE/FALSE) -# -# OUTPUT : -# - outputs commanded by the user named respectively result.tsv for the text -# results file, barplot.png for the barplot image file and dotplot.png for the -# dotplot image file - -options(warn=-1) #TURN OFF WARNINGS !!!!!! - -# loading topGO library -suppressMessages(library(topGO)) - -# Read file and return file content as data.frame -readfile = function(filename, header) { - if (header == "true") { - # Read only first line of the file as header: - headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") - #Read the data of the files (skipping the first row) - file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") - # Remove empty rows - file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] - #And assign the header to the data - names(file) <- headers - } - else { - file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") - # Remove empty rows - file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] - } - return(file) -} - -check_ens_ids <- function(vector) { - ens_pattern = "^(ENS[A-Z]+[0-9]{11}|[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?|CG[0-9]+|[A-Z0-9]+\\.[0-9]+|YM[A-Z][0-9]{3}[a-z][0-9])$" - return(grepl(ens_pattern,vector)) -} - -'%!in%' <- function(x,y)!('%in%'(x,y)) - - -# Parse command line arguments - -args = commandArgs(trailingOnly = TRUE) - -# create a list of the arguments from the command line, separated by a blank space -hh <- paste(unlist(args),collapse=' ') - -# delete the first element of the list which is always a blank space -listoptions <- unlist(strsplit(hh,'--'))[-1] - -# for each input, split the arguments with blank space as separator, unlist, -# and delete the first element which is the input name (e.g --inputtype) -options.args <- sapply(listoptions,function(x){ - unlist(strsplit(x, '[ \t\n]+'))[-1] - }) -# same as the step above, except that only the names are kept -options.names <- sapply(listoptions,function(x){ - option <- unlist(strsplit(x, '[ \t\n]+'))[1] -}) -names(options.args) <- unlist(options.names) - - -if (length(options.args) != 12) { - stop("Not enough/Too many arguments", call. = FALSE) -} - -#save(options.args,file="/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.Rda") -#load("/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.Rda") - - -typeinput = options.args[1] -listfile = options.args[2] -onto = as.character(options.args[3]) -option = as.character(options.args[4]) -correction = as.character(options.args[6]) -threshold = as.numeric(options.args[5]) -text = as.character(options.args[7]) -barplot = as.character(options.args[8]) -dotplot = as.character(options.args[9]) -column = as.numeric(gsub("c","",options.args[10])) -geneuniverse = as.character(options.args[11]) -header = as.character(options.args[12]) - -if (typeinput=="copypaste"){ - sample = as.data.frame(unlist(listfile)) - sample = sample[,column] -} -if (typeinput=="tabfile"){ - - if (header=="TRUE"){ - sample = readfile(listfile, "true") - }else{ - sample = readfile(listfile, "false") - } - sample = sample[,column] -} - -#check of ENS ids -if (! any(check_ens_ids(sample))){ - print("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file") - stop() -} - -# Launch enrichment analysis and return result data from the analysis or the null -# object if the enrichment could not be done. -goEnrichment = function(geneuniverse,sample,onto){ - - # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their - # associated ensembl ids according to the org package - xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") - allGenes = unique(unlist(xx)) - # check if the genes given by the user can be found in the org package (gene - # universe), that is in - # allGenes - if (length(intersect(sample,allGenes))==0){ - - print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") - return(c(NULL,NULL)) - - } - - geneList = factor(as.integer(allGenes %in% sample)) - names(geneList) <- allGenes - - - #topGO enrichment - - - # Creation of a topGOdata object - # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy - # Parameters : - # ontology : character string specifying the ontology of interest (BP, CC, MF) - # allGenes : named vector of type numeric or factor - # annot : tells topGO how to map genes to GO annotations. - # argument not used here : nodeSize : at which minimal number of GO annotations - # do we consider a gene - - myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") - - - # Performing enrichment tests - result <- runTest(myGOdata, algorithm=option, statistic="fisher") - return(c(result,myGOdata)) -} - -# Some libraries such as GOsummaries won't be able to treat the values such as -# "< 1e-30" produced by topGO. As such it is important to delete the < char -# with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. -deleteInfChar = function(values){ - - lines = grep("<",values) - if (length(lines)!=0){ - for (line in lines){ - values[line]=gsub("<","",values[line]) - } - } - return(values) -} - -corrMultipleTesting = function(result, myGOdata,correction,threshold){ - - # adjust for multiple testing - if (correction!="none"){ - # GenTable : transforms the result object into a list. Filters can be applied - # (e.g : with the topNodes argument, to get for instance only the n first - # GO terms with the lowest pvalues), but as we want to apply a correction we - # take all the GO terms, no matter their pvalues - allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) - # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these - # values are converted to 1e-30 to be able to correct the pvalues - pvaluestmp = deleteInfChar(allRes$test) - - # the correction is done from the modified pvalues - allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) - allRes = as.data.frame(allRes) - - # Rename the test column by pvalues, so that is more explicit - nb = which(names(allRes) %in% c("test")) - - names(allRes)[nb] = "pvalues" - - allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] - if (length(allRes$pvalues)==0){ - print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") - return(NULL) - } - allRes = allRes[order(allRes$qvalues),] - } - - if (correction=="none"){ - # get all the go terms under user threshold - mysummary <- summary(attributes(result)$score <= threshold) - numsignif <- as.integer(mysummary[[3]]) - # get all significant nodes - allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) - - - allRes = as.data.frame(allRes) - # Rename the test column by pvalues, so that is more explicit - nb = which(names(allRes) %in% c("test")) - names(allRes)[nb] = "pvalues" - if (numsignif==0){ - - print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") - return(NULL) - } - - allRes = allRes[order(allRes$pvalues),] - } - - return(allRes) -} - -# roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 -roundValues = function(values){ - for (line in 1:length(values)){ - values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) - } - return(values) -} - -createDotPlot = function(data, onto){ - - values = deleteInfChar(data$pvalues) - values = roundValues(values) - values = as.numeric(values) - - geneRatio = data$Significant/data$Annotated - goTerms = data$Term - count = data$Significant - - labely = paste("GO terms",onto,sep=" ") - ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" ) - ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") -} - -createBarPlot = function(data, onto){ - - - values = deleteInfChar(data$pvalues) - values = roundValues(values) - - values = as.numeric(values) - goTerms = data$Term - count = data$Significant - - labely = paste("GO terms",onto,sep=" ") - ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") - ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") -} - - -# Produce the different outputs -createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ - - - if (is.null(result)){ - - if (text=="TRUE"){ - - err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." - write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) - - } - - if (barplot=="TRUE"){ - - png(filename="barplot.png") - plot.new() - #text(0,0,err_msg) - dev.off() - } - - if (dotplot=="TRUE"){ - - png(filename="dotplot.png") - plot.new() - #text(0,0,err_msg) - dev.off() - - } - return(TRUE) - } - - - if (is.null(cut_result)){ - - - if (text=="TRUE"){ - - err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." - write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) - - } - - if (barplot=="TRUE"){ - - png(filename="barplot.png") - plot.new() - text(0,0,err_msg) - dev.off() - } - - if (dotplot=="TRUE"){ - - png(filename="dotplot.png") - plot.new() - text(0,0,err_msg) - dev.off() - - } - return(TRUE) - - - - } - - if (text=="TRUE"){ - write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) - } - - if (barplot=="TRUE"){ - - createBarPlot(cut_result, onto) - } - - if (dotplot=="TRUE"){ - - createDotPlot(cut_result, onto) - } -} - - - -# Load R library ggplot2 to plot graphs -suppressMessages(library(ggplot2)) - -# Launch enrichment analysis -allresult = suppressMessages(goEnrichment(geneuniverse,sample,onto)) -result = allresult[1][[1]] -myGOdata = allresult[2][[1]] -if (!is.null(result)){ - - # Adjust the result with a multiple testing correction or not and with the user - # p-value cutoff - cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) -}else{ - - cut_result=NULL - -} - - -createOutputs(result, cut_result,text, barplot, dotplot, onto) -
--- a/topGO.xml Fri Sep 21 05:32:38 2018 -0400 +++ b/topGO.xml Tue Dec 18 10:06:00 2018 -0500 @@ -1,15 +1,14 @@ -<tool id="topGO" name="topGO" version="2018.09.21"> - <description> - Enrichment analysis for Gene Ontology - </description> +<tool id="topGO" name="Enrichment analysis for Gene Ontology" version="2018.12.17"> + <description>(Human, Mouse, Rat)[topGO]</description> <requirements> <requirement type="package" version="3.4.1">R</requirement> <requirement type="package" version="3.0.0">r-ggplot2</requirement> <requirement type="package" version="3.5.0">bioconductor-org.hs.eg.db</requirement> <requirement type="package" version="3.5.0">bioconductor-org.mm.eg.db</requirement> - <requirement type="package" version="3.5.0">bioconductor-org.ce.eg.db</requirement> - <requirement type="package" version="3.5.0">bioconductor-org.dm.eg.db</requirement> - <requirement type="package" version="3.5.0">bioconductor-org.sc.sgd.db</requirement> + <requirement type="package" version="3.5.0">bioconductor-org.rn.eg.db</requirement> + <!--requirement type="package" version="3.6.0">bioconductor-org.ce.eg.db</requirement--> + <!--requirement type="package" version="3.6.0">bioconductor-org.dm.eg.db</requirement--> + <!--requirement type="package" version="3.6.0">bioconductor-org.sc.sgd.db</requirement--> <!--requirement type="package" version="3.5.0">bioconductor-org.at.tair.db</requirement--> <requirement type="package" version="1.56.0">bioconductor-graph</requirement> <requirement type="package" version="1.40.0">bioconductor-annotationdbi</requirement> @@ -20,52 +19,44 @@ <exit_code range="1:" /> </stdio> <command><![CDATA[ - - #if $inputtype.filetype == "file_all": - Rscript --vanilla $__tool_directory__/enrichment_v3.R - --inputtype tabfile - --input '$inputtype.genelist' - --ontology '$ontocat' - --option '$option' - --threshold '$threshold' - --correction '$correction' - --textoutput '$condtext.textoutput' - --barplotoutput '$condbar.barplotoutput' - --dotplotoutput '$conddot.dotplotoutput' - --column '$inputtype.column' - --geneuniverse '$geneuniverse' - --header '$inputtype.header' - #end if - - - #if $inputtype.filetype == "copy_paste": - Rscript --vanilla $__tool_directory__/enrichment_v3.R - --inputtype copypaste - --input '$inputtype.genelist' - --ontology '$ontocat' - --option '$option' - --threshold '$threshold' - --correction '$correction' - --textoutput '$condtext.textoutput' - --barplotoutput '$condbar.barplotoutput' - --dotplotoutput '$conddot.dotplotoutput' - --column c1 - --geneuniverse '$geneuniverse' - --header None + + Rscript --vanilla $__tool_directory__/topGO_enrichment.R + --inputtype="$inputtype.filetype" + --input='$inputtype.genelist' + + #if $inputtype.filetype == "file" + --column='$inputtype.column' + --header='$inputtype.header' #end if + --ontology='$ontocat' + --option='$option' + --threshold='$threshold' + --correction='$correction' + --textoutput='true' + --plot='$plot' + --geneuniverse='$geneuniverse' + --background="$background_genes.background" + #if $background_genes.background == "true" + --background_genes="$background_genes.inputtype.genelist" + --background_input_type="$background_genes.inputtype.filetype" + #if $background_genes.inputtype.filetype == "file" + --background_header="$background_genes.inputtype.header" + --background_column="$background_genes.inputtype.column" + #end if + #end if ]]></command> <inputs> <conditional name="inputtype"> - <param name="filetype" type="select" label="Select your type of input file" help="The identifiers must be Ensembl gene IDs (e.g : ENSG00000139618). If it is not the case, please use the ID Mapping tool."> - <option value="file_all" selected="true">Input file containing your identifiers</option> + <param name="filetype" type="select" label="Enter your IDs (Ensembl Gene only)" help="Copy/paste or from a file"> + <option value="file" selected="true">Input file containing your IDs</option> <option value="copy_paste">Copy/paste your list of IDs</option> </param> <when value="copy_paste"> - <param name="genelist" type="text" label="Enter a list of identifiers"> + <param name="genelist" type="text" label="Enter a list of IDs"> <sanitizer> <valid initial="string.printable"> <remove value="'"/> @@ -76,85 +67,90 @@ </sanitizer> </param> </when> - <when value="file_all"> - <param name="genelist" type="data" format="txt,tabular" label="Choose an input file" help="This file must imperatively have 1 column filled with IDs consistent with the database that will be used. Please use the MappingIDs component if this is not the case."/> - <param name="column" type="text" label="Please specify the column where your Ensembl IDs are (e.g : Enter 'c1' for column n°1..)" value="c1"/> - - <param name="header" type="select" label="Does your file have a header?" multiple="false" optional="false"> - <option value="TRUE" selected="true">Yes</option> - <option value="FALSE" selected="false">No</option> - </param> + <when value="file"> + <param name="genelist" type="data" format="txt,tabular" label="Select your file" help=""/> + <param name="column" type="text" label="Column number of IDs" help="For example, fill in 'c1' if it is the first column, 'c2' if it is the second column and so on"/> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header?" /> </when> </conditional> - <param name="geneuniverse" type="select" label="Select a specie"> - <!--option value="org.At.tair.db" >Arabidopsis</option--> - <option value="org.Ce.eg.db" >Worm (C. elegans)</option> - <option value="org.Dm.eg.db" >Fly (D. melanogaster)</option> - <option value="org.Hs.eg.db" selected="true">Human (H. sapiens)</option> - <option value="org.Mm.eg.db" >Mouse (M. musculus)</option> - <option value="org.Sc.sgd.db" >Yeast (S. cerevisiae)</option> - </param> - - <param name="ontocat" type="select" label="Ontology category"> - <option value="BP" >Biological Process</option> - <option value="CC" >Cellular Component</option> - <option value="MF" >Molecular Function</option> + <conditional name="background_genes"> + <param name="background" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Define your own background IDs ?"/> + <when value="true"> + <conditional name="inputtype"> + <param name="filetype" type="select" label="Enter your background IDs (Ensembl gene IDs)" help="(e.g : ENSG00000139618)"> + <option value="file" selected="true">Input file containing your background IDs</option> + <option value="copy_paste">Copy/paste your background IDs</option> </param> - - <param name="option" type="select" label="Choose the topGO option for your analysis"> - <option value="classic" >Classic fisher test</option> - <option value="elim" selected="true">Elim</option> - <option value="weight01" >Weight01</option> - <option value="parentchild" >ParentChild</option> + <when value="copy_paste"> + <param name="genelist" type="text" label="Copy/paste your background IDs" help="IDs must be separated by spaces into the form field, for example: ENSG00000139618 ENSG00000007350"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> </param> - <param name="threshold" type="text" label="Enter the p-value threshold level under the form 1e-level wanted (e.g : 1e-3)" value="1e-3"/> - <param name="correction" label="Choose a correction for multiple testing" type="select"> - <option value="none" >None</option> - <option value="holm">Holm correction</option> - <option value="hochberg" >Hochberg correction</option> - <option value="hommel" >Hommel correction</option> - <option value="bonferroni" >Bonferroni correction</option> - <option value="BH" selected="true">Benjamini and Hochberg</option> - <option value="BY" >Benjamini and Yekutieli</option> - <option value="fdr" >FDR</option> - </param> - <conditional name="condtext"> - <param name="textoutput" type="select" label="Generate a text file for results"> - <option value="TRUE">Yes</option> - <option value="FALSE">No</option> - </param> - <when value="TRUE"/> - <when value="FALSE"/> - </conditional> - <conditional name="condbar"> - <param name="barplotoutput" type="select" label="Generate a barplot of over-represented GO terms"> - <option value="TRUE">Yes</option> - <option value="FALSE">No</option> - </param> - <when value="TRUE"/> - <when value="FALSE"/> - </conditional> - <conditional name="conddot"> - <param name="dotplotoutput" type="select" label="Generate a dotplot of over-represented GO terms"> - <option value="TRUE">Yes</option> - <option value="FALSE">No</option> - </param> - <when value="TRUE"/> - <when value="FALSE"/> - </conditional> + </when> + <when value="file"> + <param name="genelist" type="data" format="txt,tabular" label="Select file that contains your background IDs list" help=""/> + <param name="header" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Does file contain header ?" /> + <param name="column" type="text" label="Column number of IDs" value="c1" help="For example, fill in 'c1' if it is the first column, 'c2' if it is the second column and so on"/> + </when> + </conditional> + </when> + <when value="false"/> + </conditional> + <param name="geneuniverse" type="select" label="Species"> + <!--option value="org.At.tair.db" >Arabidopsis</option--> + <!--option value="org.Ce.eg.db" >Worm (C. elegans)</option--> + <!--option value="org.Dm.eg.db" >Fly (D. melanogaster)</option--> + <option value="org.Hs.eg.db" selected="true">Human (Homo sapiens)</option> + <option value="org.Mm.eg.db" >Mouse (Mouse musculus)</option> + <option value="org.Rn.eg.db" >Rat (Rattus norvegicus)</option> + <!--option value="org.Sc.sgd.db" >Yeast (S. cerevisiae)</option--> + </param> + <param name="ontocat" type="select" label="GO terms category"> + <option value="BP" >Biological Process</option> + <option value="CC" >Cellular Component</option> + <option value="MF" >Molecular Function</option> + </param> + <param name="option" type="select" label="Select the topGO parameter (see user doc)"> + <option value="classic" >Classic Fisher test</option> + <option value="elim" selected="true">Elim</option> + <option value="weight01" >Weight01</option> + <option value="parentchild" >ParentChild</option> + </param> + <param name="threshold" type="text" label="p-value threshold (e.g : 1e-3)" value="1e-3"/> + <param name="correction" label="Multiple testing procedure (p-value adjustment)" type="select"> + <option value="none" >None</option> + <option value="holm">Holm correction</option> + <option value="hochberg" >Hochberg correction</option> + <option value="hommel" >Hommel correction</option> + <option value="bonferroni" >Bonferroni correction</option> + <option value="BH" selected="true">Benjamini and Hochberg</option> + <option value="BY" >Benjamini and Yekutieli</option> + <option value="fdr" >FDR</option> + </param> + <!--param name="textoutput" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Generate a text file for results" /--> + <param name="plot" type="select" display="checkboxes" multiple="true" label="Graphical display" optional="false"> + <option selected = "true" value="dotplot">dot-plot</option> + <option value="barplot">bar-plot</option> + </param> </inputs> <outputs> - <data name="outputtext" format="tabular" label="Text output for topGO analysis $ontocat category" from_work_dir="result.csv"> - <filter>condtext['textoutput']=="TRUE"</filter> + <data name="outputtext" format="tsv" label="Text output for topGO analysis $ontocat category" from_work_dir="result"> + <filter>textoutput</filter> </data> <data name="outputbarplot" format="png" label="Barplot output for topGO analysis $ontocat category" from_work_dir="barplot.png"> - <filter>condbar['barplotoutput']=="TRUE"</filter> + <filter>barplot</filter> </data> <data name="outputdotplot" format="png" label="Dotplot output for topGO analysis $ontocat category" from_work_dir="dotplot.png"> - <filter>conddot['dotplotoutput']=="TRUE"</filter> + <filter>dotplot</filter> </data> </outputs> @@ -188,62 +184,56 @@ <help><![CDATA[ -**Galaxy component based on R package topGO.** +**Description** + +This tool is based on R package topGO. topGO package provides tools for testing GO terms while accounting for the topology of the GO graph. Different test statistics and different methods for eliminating local similarities and dependencies between GO terms can be applied. + +This component computes the GO terms representativity of a gene list in one ontology category (Biological Process "BP", Cellular Component "CC", Molecular Function "MF"). This representativity is evaluated in comparison to the background list of all genes/proteins (of the selected species) associated associated with GO terms of the chosen category (BP,CC,MF). + +----- **Input required** -This component works with Ensembl gene ids (e.g : ENSG0000013618). You can -copy/paste these identifiers or supply a tabular file (.csv, .tsv, .txt, .tab) -where there are contained. +This component works with Ensembl gene IDs (e.g : ENSG0000013618). You can copy/paste these identifiers or supply a tabular file (.csv, .tsv, .txt, .tab) +and then specifying the column number that contains the ENSG IDs. + +----- + +**Parameters** -**Principle** +"Species": "Species": the three available species are Homo sapiens, Mus musculus and Rattus norvegicus + +"GO terms category": select either Biogical Process (BP)(by default), Cellular Component (CC) or Molecular Function (MF) + +"Select the topGO parameter (see user doc)": topGO provides a classic Fisher test for evaluating which GO terms are over-represented in your gene/protein list; other methodologies are also provided (Elim, Weight01, Parentchild). For the merits of each option and their algorithmic descriptions, please refer to topGO manual: +https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf -This component provides the GO terms representativity of a gene list in one ontology category (Biological Process "BP", Cellular Component "CC", Molecular Function "MF"). This representativity is evaluated in comparison to the background list of all human genes associated associated with GO terms of the chosen category (BP,CC,MF). This background is given by the R package "org.Hs.eg.db", which is a genome wide association package for **human**. +"p-value threshold (e.g : 1e-3)": must be in the form of "1e-5" (i.e. 0.00001) + +"Multiple testing procedure (p-value adjustment): several FDR procedure for multiple testing and p-value adjustment are available: Holm, Hochberg +Hommel, Bonferroni, BH (Benjamini-Hochberg), BY (Benjamini-Yekutieli), FDR. Default is BH (most commonly used) + +----- **Output** -Three kind of outputs are available : a textual output, a barplot output and -a dotplot output. +Three outputs are available : a textual output, a barplot and/or a dotplot (set by default) graphical outputs. -*Textual output* : -The text output lists all the GO-terms that were found significant under the specified threshold. - - -The different fields are as follow : +*Textual output* -- Annotated : number of genes in org.Hs.eg.db which are annotated with the GO-term. - -- Significant : number of genes belonging to your input which are annotated with the GO-term. +The text output lists all the GO-terms that were found significantly enriched according to the specified threshold (p-value). -- Expected : show an estimate of the number of genes a node of size Annotated would have if the significant genes were to be randomly selected from the gene universe. - -- pvalues : pvalue obtained after the test - -- ( qvalues : additional column with adjusted pvalues ) +The different fields are as follow: - -**Tests** - -topGO provides a classic fisher test for evaluating if some GO terms are over-represented in your gene list, but other options are also provided (elim, weight01,parentchild). For the merits of each option and their algorithmic descriptions, please refer to topGO manual : -https://bioconductor.org/packages/release/bioc/vignettes/topGO/inst/doc/topGO.pdf +- Annotated : number of genes in the selected species that are annotated with the GO-term. -**Multiple testing corrections** - -Furthermore, the following corrections for multiple testing can also be applied : - -- holm +- Significant : number of genes belonging to your input annotated with the GO-term. -- hochberg - -- hommel - -- bonferroni +- Expected : represents the expected number of interesting genes mapped to the GO term if the interesting genes were randomly distributed over all GO terms. -- BH +- p-values : p-value obtained after the test -- BY - -- fdr +- ( q-values : additional column with adjusted pvalues ) ----- @@ -251,13 +241,17 @@ **Authors** -Alexa A and Rahnenfuhrer J (2016). topGO: Enrichment Analysis for Gene Ontology. R package version 2.30.0. +Alexa A, Rahnenführer J, Lengauer T. Improved scoring of functional groups from gene expression data by decorrelating GO graph structure. Bioinformatics. 2006. 22(13):1600-7. PubMed PMID: 16606683. + +----- + +.. class:: infomark **Galaxy integration** -Lisa Peru, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR +Lisa Perus, T.P. Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR -Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR This work has been partially funded through the French National Agency for Research (ANR) IFB project.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/topGO_enrichment.R Tue Dec 18 10:06:00 2018 -0500 @@ -0,0 +1,352 @@ +options(warn=-1) #TURN OFF WARNINGS !!!!!! + +suppressMessages(library(ggplot2)) +suppressMessages(library(topGO)) + +get_args <- function(){ + + ## Collect arguments + args <- commandArgs(TRUE) + + ## Default setting when no arguments passed + if(length(args) < 1) { + args <- c("--help") + } + + ## Help section + if("--help" %in% args) { + cat("Pathview R script + Arguments: + --help Print this test + --input_type + --onto + --option + --correction + --threshold + --text + --plot + --column + --geneuniverse + --header + + Example: + Rscript --vanilla enrichment_v3.R --inputtype=tabfile (or copypaste) --input=file.txt --ontology='BP/CC/MF' --option=option + (e.g : classic/elim...) --threshold=threshold --correction=correction --textoutput=text --barplotoutput=barplot --dotplotoutput=dotplot + --column=column --geneuniver=human \n\n") + + q(save="no") + } + + parseArgs <- function(x) strsplit(sub("^--", "", x), "=") + argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) + args <- as.list(as.character(argsDF$V2)) + names(args) <- argsDF$V1 + + return(args) +} + +read_file <- function(path,header){ + file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) + if (inherits(file,"try-error")){ + stop("File not found !") + }else{ + return(file) + } +} + +get_list_from_cp <-function(list){ + list = gsub(";"," ",list) + list = strsplit(list, "[ \t\n]+")[[1]] + list = list[list != ""] #remove empty entry + list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") + return(list) +} + +check_ens_ids <- function(vector) { + ens_pattern = "^(ENS[A-Z]+[0-9]{11}|[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?|CG[0-9]+|[A-Z0-9]+\\.[0-9]+|YM[A-Z][0-9]{3}[a-z][0-9])$" + return(grepl(ens_pattern,vector)) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +# Some libraries such as GOsummaries won't be able to treat the values such as +# "< 1e-30" produced by topGO. As such it is important to delete the < char +# with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. +deleteInfChar = function(values){ + + lines = grep("<",values) + if (length(lines)!=0){ + for (line in lines){ + values[line]=gsub("<","",values[line]) + } + } + return(values) +} + +corrMultipleTesting = function(result, myGOdata,correction,threshold){ + + # adjust for multiple testing + if (correction!="none"){ + # GenTable : transforms the result object into a list. Filters can be applied + # (e.g : with the topNodes argument, to get for instance only the n first + # GO terms with the lowest pvalues), but as we want to apply a correction we + # take all the GO terms, no matter their pvalues + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) + # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these + # values are converted to 1e-30 to be able to correct the pvalues + pvaluestmp = deleteInfChar(allRes$test) + + # the correction is done from the modified pvalues + allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) + allRes = as.data.frame(allRes) + + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + + names(allRes)[nb] = "pvalues" + + allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] + if (length(allRes$pvalues)==0){ + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + allRes = allRes[order(allRes$qvalues),] + } + + if (correction=="none"){ + # get all the go terms under user threshold + mysummary <- summary(attributes(result)$score <= threshold) + numsignif <- as.integer(mysummary[[3]]) + # get all significant nodes + allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) + + + allRes = as.data.frame(allRes) + # Rename the test column by pvalues, so that is more explicit + nb = which(names(allRes) %in% c("test")) + names(allRes)[nb] = "pvalues" + if (numsignif==0){ + + print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") + return(NULL) + } + + allRes = allRes[order(allRes$pvalues),] + } + + return(allRes) +} + +# roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 +roundValues = function(values){ + for (line in 1:length(values)){ + values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) + } + return(values) +} + +createDotPlot = function(data, onto){ + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + geneRatio = data$Significant/data$Annotated + goTerms = data$Term + count = data$Significant + + labely = paste("GO terms",onto,sep=" ") + ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" ) + ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") +} + +createBarPlot = function(data, onto){ + + + values = deleteInfChar(data$pvalues) + values = roundValues(values) + values = as.numeric(values) + + goTerms = data$Term + count = data$Significant + + labely = paste("GO terms",onto,sep=" ") + ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") + ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") +} + +# Produce the different outputs +createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ + + if (is.null(result)){ + if (text){ + err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." + write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) + } + if (barplot){ + png(filename="barplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + } + if (dotplot){ + png(filename="dotplot.png") + plot.new() + #text(0,0,err_msg) + dev.off() + } + opt <- options(show.error.messages=FALSE) + on.exit(options(opt)) + stop("null result") + } + + if (is.null(cut_result)){ + if (text){ + err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." + write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) + } + if (barplot){ + png(filename="barplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + } + if (dotplot){ + png(filename="dotplot.png") + plot.new() + text(0,0,err_msg) + dev.off() + } + opt <- options(show.error.messages=FALSE) + on.exit(options(opt)) + stop("null cut_result") + } + + if (text){ + write.table(cut_result, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) + } + + if (barplot){ + createBarPlot(cut_result, onto) + } + + if (dotplot){ + createDotPlot(cut_result, onto) + } +} + +# Launch enrichment analysis and return result data from the analysis or the null +# object if the enrichment could not be done. +goEnrichment = function(geneuniverse,sample,background_sample,onto){ + + if (is.null(background_sample)){ + xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their associated ensembl ids according to the org package + allGenes = unique(unlist(xx)) # check if the genes given by the user can be found in the org package (gene universe), that is in allGenes + } else { + allGenes = background_sample + } + + if (length(intersect(sample,allGenes))==0){ + print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") + return(c(NULL,NULL)) + } + + geneList = factor(as.integer(allGenes %in% sample)) + if (length(levels(geneList)) == 1 ){ + stop("All or none of the background genes are found in tested genes dataset, enrichment analysis can't be done") + } + names(geneList) <- allGenes + + #topGO enrichment + + # Creation of a topGOdata object + # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy + # Parameters : + # ontology : character string specifying the ontology of interest (BP, CC, MF) + # allGenes : named vector of type numeric or factor + # annot : tells topGO how to map genes to GO annotations. + # argument not used here : nodeSize : at which minimal number of GO annotations + # do we consider a gene + + myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") + + # Performing enrichment tests + result <- runTest(myGOdata, algorithm=option, statistic="fisher") + return(c(result,myGOdata)) +} + +args <- get_args() + +#save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda") +#load("/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda") + +input_type = args$inputtype +input = args$input +onto = args$ontology +option = args$option +correction = args$correction +threshold = as.numeric(args$threshold) +text = str2bool(args$textoutput) +barplot = "barplot" %in% unlist(strsplit(args$plot,",")) +dotplot = "dotplot" %in% unlist(strsplit(args$plot,",")) +column = as.numeric(gsub("c","",args$column)) +geneuniverse = args$geneuniverse +header = str2bool(args$header) +background = str2bool(args$background) +if (background){ + background_genes = args$background_genes + background_input_type = args$background_input_type + background_header = str2bool(args$background_header) + background_column = as.numeric(gsub("c","",args$background_column)) +} + +#get input +if (input_type=="copy_paste"){ + sample <- get_list_from_cp(input) +} else if (input_type=="file"){ + tab=read_file(input,header) + sample = trimws(unlist(strsplit(tab[,column],";"))) +} + +#check of ENS ids +if (! any(check_ens_ids(sample))){ + print("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file") + stop() +} + +#get input if background genes +if (background){ + if (background_input_type=="copy_paste"){ + background_sample <- get_list_from_cp(background_genes) + } else if (background_input_type=="file"){ + background_tab=read_file(background_genes,background_header) + background_sample = unique(trimws(unlist(strsplit(background_tab[,background_column],";")))) + } + #check of ENS ids + if (! any(check_ens_ids(background_sample))){ + print("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file") + stop() + } +} else { + background_sample=NULL +} + +# Launch enrichment analysis +allresult = suppressMessages(goEnrichment(geneuniverse,sample,background_sample,onto)) +result = allresult[1][[1]] +myGOdata = allresult[2][[1]] +if (!is.null(result)){ + cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) # Adjust the result with a multiple testing correction or not and with the user, p-value cutoff +}else{ + cut_result=NULL +} + +createOutputs(result, cut_result,text, barplot, dotplot, onto)