Mercurial > repos > proteore > proteore_go_terms_enrich_comparison
diff GO_terms_enrich_comparison.R @ 7:2e7245bd643d draft default tip
"planemo upload commit 373a85c4179ac6b5cfc77013eab020badb8c8370-dirty"
author | proteore |
---|---|
date | Tue, 04 Feb 2020 05:16:19 -0500 |
parents | 04f363ee805a |
children |
line wrap: on
line diff
--- a/GO_terms_enrich_comparison.R Thu Jan 23 03:16:11 2020 -0500 +++ b/GO_terms_enrich_comparison.R Tue Feb 04 05:16:19 2020 -0500 @@ -1,327 +1,327 @@ -options(warn=-1) #TURN OFF WARNINGS !!!!!! -suppressMessages(library(clusterProfiler,quietly = TRUE)) -suppressMessages(library(plyr, quietly = TRUE)) -suppressMessages(library(ggplot2, quietly = TRUE)) -suppressMessages(library(DOSE, quietly = TRUE)) - -#return the number of character from the longest description found (from the 10 first) -max_str_length_10_first <- function(vector){ - vector <- as.vector(vector) - nb_description = length(vector) - if (nb_description >= 10){nb_description=10} - return(max(nchar(vector[1:nb_description]))) -} - -str2bool <- function(x){ - if (any(is.element(c("t","true"),tolower(x)))){ - return (TRUE) - }else if (any(is.element(c("f","false"),tolower(x)))){ - return (FALSE) - }else{ - return(NULL) - } -} - -get_args <- function(){ - - ## Collect arguments - args <- commandArgs(TRUE) - #value = character vector of length=nb of arguments - - ## Default setting when no arguments passed - if(length(args) < 1) { - args <- c("--help") - } - - ## Help section - if("--help" %in% args) { - cat("Selection and Annotation HPA - Arguments: - --inputtype1: type of input (list of id or filename) - --inputtype2: type of input (list of id or filename) - --inputtype3: type of input (list of id or filename) - --input1: input1 - --input2: input2 - --input3: input3 - --column1: the column number which you would like to apply... - --column2: the column number which you would like to apply... - --column3: the column number which you would like to apply... - --header1: true/false if your file contains a header - --header2: true/false if your file contains a header - --header3: true/false if your file contains a header - --ont: ontology to use - --org: organism db package - --list_name1: name of the first list - --list_name2: name of the second list - --list_name3: name of the third list \n") - - q(save="no") - } - - parseArgs <- function(x) strsplit(sub("^--", "", x), "=") - argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) - args <- as.list(as.character(argsDF$V2)) - names(args) <- argsDF$V1 - - return(args) -} - -get_ids=function(input, inputtype, header , ncol) { - - if (inputtype == "text") { - ids = strsplit(input, "[ \t\n]+")[[1]] - } else if (inputtype == "file") { - header=str2bool(header) - ncol=get_cols(ncol) - csv = read.csv(input,header=header, sep="\t", as.is=T) - ids=csv[,ncol] - } - - ids = unlist(strsplit(as.character(ids),";")) - ids = ids[which(!is.na(ids))] - - return(ids) -} - -str2bool <- function(x){ - if (any(is.element(c("t","true"),tolower(x)))){ - return (TRUE) - }else if (any(is.element(c("f","false"),tolower(x)))){ - return (FALSE) - }else{ - return(NULL) - } -} - -check_ids <- function(vector,type) { - uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" - entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" - if (type == "entrez") - return(grepl(entrez_id,vector)) - else if (type == "uniprot") { - return(grepl(uniprot_pattern,vector)) - } -} - -#res.cmp@compareClusterResult$Description <- sapply(as.vector(res.cmp@compareClusterResult$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) -fortify.compareClusterResult <- function(res.cmp, showCategory=30, by="geneRatio", split=NULL, includeAll=TRUE) { - clProf.df <- as.data.frame(res.cmp) - .split <- split - ## get top 5 (default) categories of each gene cluster. - if (is.null(showCategory)) { - result <- clProf.df - } else { - Cluster <- NULL # to satisfy codetools - topN <- function(res, showCategory) { - ddply(.data = res, .variables = .(Cluster), .fun = function(df, N) { - if (length(df$Count) > N) { - if (any(colnames(df) == "pvalue")) { - idx <- order(df$pvalue, decreasing=FALSE)[1:N] - } else { - ## for groupGO - idx <- order(df$Count, decreasing=T)[1:N] - } - return(df[idx,]) - } else { - return(df) - } - }, - N=showCategory - ) - } - if (!is.null(.split) && .split %in% colnames(clProf.df)) { - lres <- split(clProf.df, as.character(clProf.df[, .split])) - lres <- lapply(lres, topN, showCategory = showCategory) - result <- do.call('rbind', lres) - } else { - result <- topN(clProf.df, showCategory) - } - } - ID <- NULL - if (includeAll == TRUE) { - result = subset(clProf.df, ID %in% result$ID) - } - ## remove zero count - result$Description <- as.character(result$Description) ## un-factor - GOlevel <- result[,c("ID", "Description")] ## GO ID and Term - #GOlevel <- unique(GOlevel) - result <- result[result$Count != 0, ] - #bug 19.11.2019 - #result$Description <- factor(result$Description,levels=rev(GOlevel[,2])) - if (by=="rowPercentage") { - Description <- Count <- NULL # to satisfy codetools - result <- ddply(result,.(Description),transform,Percentage = Count/sum(Count),Total = sum(Count)) - ## label GO Description with gene counts. - x <- mdply(result[, c("Description", "Total")], paste, sep=" (") - y <- sapply(x[,3], paste, ")", sep="") - result$Description <- y - - ## restore the original order of GO Description - xx <- result[,c(2,3)] - xx <- unique(xx) - rownames(xx) <- xx[,1] - Termlevel <- xx[as.character(GOlevel[,1]),2] - - ##drop the *Total* column - result <- result[, colnames(result) != "Total"] - result$Description <- factor(result$Description, levels=rev(Termlevel)) - - } else if (by == "count") { - ## nothing - } else if (by == "geneRatio") { ##default - gsize <- as.numeric(sub("/\\d+$", "", as.character(result$GeneRatio))) - gcsize <- as.numeric(sub("^\\d+/", "", as.character(result$GeneRatio))) - result$GeneRatio = gsize/gcsize - cluster <- paste(as.character(result$Cluster),"\n", "(", gcsize, ")", sep="") - lv <- unique(cluster)[order(as.numeric(unique(result$Cluster)))] - result$Cluster <- factor(cluster, levels = lv) - } else { - ## nothing - } - return(result) -} - -##function plotting.clusteProfile from clusterProfiler pkg -plotting.clusterProfile <- function(clProf.reshape.df,x = ~Cluster,type = "dot", colorBy = "p.adjust",by = "geneRatio",title="",font.size=12) { - - Description <- Percentage <- Count <- Cluster <- GeneRatio <- p.adjust <- pvalue <- NULL # to - if (type == "dot") { - if (by == "rowPercentage") { - p <- ggplot(clProf.reshape.df, - aes_(x = x, y = ~Description, size = ~Percentage)) - } else if (by == "count") { - p <- ggplot(clProf.reshape.df, - aes_(x = x, y = ~Description, size = ~Count)) - } else if (by == "geneRatio") { ##DEFAULT - p <- ggplot(clProf.reshape.df, - aes_(x = x, y = ~Description, size = ~GeneRatio)) - } else { - ## nothing here - } - if (any(colnames(clProf.reshape.df) == colorBy)) { - p <- p + - geom_point() + - aes_string(color=colorBy) + - scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE)) - ## scale_color_gradientn(guide=guide_colorbar(reverse=TRUE), colors = enrichplot:::sig_palette) - } else { - p <- p + geom_point(colour="steelblue") - } - } - - p <- p + xlab("") + ylab("") + ggtitle(title) + - theme_dose(font.size) - - ## theme(axis.text.x = element_text(colour="black", size=font.size, vjust = 1)) + - ## theme(axis.text.y = element_text(colour="black", - ## size=font.size, hjust = 1)) + - ## ggtitle(title)+theme_bw() - ## p <- p + theme(axis.text.x = element_text(angle=angle.axis.x, - ## hjust=hjust.axis.x, - ## vjust=vjust.axis.x)) - - return(p) -} - -make_dotplot<-function(res.cmp,ontology) { - - dfok<-fortify.compareClusterResult(res.cmp) - dfok$Description <- sapply(as.vector(dfok$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) - p<-plotting.clusterProfile(dfok, title="") - - #plot(p, type="dot") # - output_path= paste("GO_enrich_comp_",ontology,".png",sep="") - png(output_path,height = 720, width = 600) - pl <- plot(p, type="dot") - print(pl) - dev.off() -} - -get_cols <-function(input_cols) { - input_cols <- gsub("c","",gsub("C","",gsub(" ","",input_cols))) - if (grepl(":",input_cols)) { - first_col=unlist(strsplit(input_cols,":"))[1] - last_col=unlist(strsplit(input_cols,":"))[2] - cols=first_col:last_col - } else { - cols = as.integer(unlist(strsplit(input_cols,","))) - } - return(cols) -} - -#to check -cmp.GO <- function(l,fun="enrichGO",orgdb, ontology, readable=TRUE) { - cmpGO<-compareCluster(geneClusters = l, - fun=fun, - OrgDb = orgdb, - ont=ontology, - readable=TRUE) - - return(cmpGO) -} - -check_ids <- function(vector,type) { - uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" - entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" - if (type == "entrez") - return(grepl(entrez_id,vector)) - else if (type == "uniprot") { - return(grepl(uniprot_pattern,vector)) - } -} - -main = function() { - - #to get the args of the command line - args=get_args() - - #saved - #l<-list() - #for(j in 1:args$nb){ - # i<-j-1 - # ids<-get_ids(args$input.i, args$inputtype.i, args$header.i, args$column.i) - # l[[args$name.i]]<-ids - - #} - #saved - - - l<-list() - for(j in 1:args$nb){ - i<-j-1 - ids<-get_ids( args[[paste("input",i,sep=".")]], - args[[paste("inputtype",i,sep=".")]], - args[[paste("header",i,sep=".")]], - args[[paste("column",i,sep=".")]] ) - - l[[args[[paste("name",i,sep=".")]]]]<-ids - -} - - ont = strsplit(args$ont, ",")[[1]] - org=args$org - - #load annot package - suppressMessages(library(args$org, character.only = TRUE, quietly = TRUE)) - - # Extract OrgDb - if (args$org=="org.Hs.eg.db") { - orgdb<-org.Hs.eg.db - } else if (args$org=="org.Mm.eg.db") { - orgdb<-org.Mm.eg.db - } else if (args$org=="org.Rn.eg.db") { - orgdb<-org.Rn.eg.db - } - - for(ontology in ont) { - - res.cmp<-cmp.GO(l=l,fun="enrichGO",orgdb, ontology, readable=TRUE) - make_dotplot(res.cmp,ontology) - output_path = paste("GO_enrich_comp_",ontology,".tsv",sep="") - write.table(res.cmp@compareClusterResult, output_path, sep="\t", row.names=F, quote=F) - } - -} #end main - -main() - +options(warn=-1) #TURN OFF WARNINGS !!!!!! +suppressMessages(library(clusterProfiler,quietly = TRUE)) +suppressMessages(library(plyr, quietly = TRUE)) +suppressMessages(library(ggplot2, quietly = TRUE)) +suppressMessages(library(DOSE, quietly = TRUE)) + +#return the number of character from the longest description found (from the 10 first) +max_str_length_10_first <- function(vector){ + vector <- as.vector(vector) + nb_description = length(vector) + if (nb_description >= 10){nb_description=10} + return(max(nchar(vector[1:nb_description]))) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +get_args <- function(){ + + ## Collect arguments + args <- commandArgs(TRUE) + #value = character vector of length=nb of arguments + + ## Default setting when no arguments passed + if(length(args) < 1) { + args <- c("--help") + } + + ## Help section + if("--help" %in% args) { + cat("Selection and Annotation HPA + Arguments: + --inputtype1: type of input (list of id or filename) + --inputtype2: type of input (list of id or filename) + --inputtype3: type of input (list of id or filename) + --input1: input1 + --input2: input2 + --input3: input3 + --column1: the column number which you would like to apply... + --column2: the column number which you would like to apply... + --column3: the column number which you would like to apply... + --header1: true/false if your file contains a header + --header2: true/false if your file contains a header + --header3: true/false if your file contains a header + --ont: ontology to use + --org: organism db package + --list_name1: name of the first list + --list_name2: name of the second list + --list_name3: name of the third list \n") + + q(save="no") + } + + parseArgs <- function(x) strsplit(sub("^--", "", x), "=") + argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) + args <- as.list(as.character(argsDF$V2)) + names(args) <- argsDF$V1 + + return(args) +} + +get_ids=function(input, inputtype, header , ncol) { + + if (inputtype == "text") { + ids = strsplit(input, "[ \t\n]+")[[1]] + } else if (inputtype == "file") { + header=str2bool(header) + ncol=get_cols(ncol) + csv = read.csv(input,header=header, sep="\t", as.is=T) + ids=csv[,ncol] + } + + ids = unlist(strsplit(as.character(ids),";")) + ids = ids[which(!is.na(ids))] + + return(ids) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +check_ids <- function(vector,type) { + uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" + entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" + if (type == "entrez") + return(grepl(entrez_id,vector)) + else if (type == "uniprot") { + return(grepl(uniprot_pattern,vector)) + } +} + +#res.cmp@compareClusterResult$Description <- sapply(as.vector(res.cmp@compareClusterResult$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) +fortify.compareClusterResult <- function(res.cmp, showCategory=30, by="geneRatio", split=NULL, includeAll=TRUE) { + clProf.df <- as.data.frame(res.cmp) + .split <- split + ## get top 5 (default) categories of each gene cluster. + if (is.null(showCategory)) { + result <- clProf.df + } else { + Cluster <- NULL # to satisfy codetools + topN <- function(res, showCategory) { + ddply(.data = res, .variables = .(Cluster), .fun = function(df, N) { + if (length(df$Count) > N) { + if (any(colnames(df) == "pvalue")) { + idx <- order(df$pvalue, decreasing=FALSE)[1:N] + } else { + ## for groupGO + idx <- order(df$Count, decreasing=T)[1:N] + } + return(df[idx,]) + } else { + return(df) + } + }, + N=showCategory + ) + } + if (!is.null(.split) && .split %in% colnames(clProf.df)) { + lres <- split(clProf.df, as.character(clProf.df[, .split])) + lres <- lapply(lres, topN, showCategory = showCategory) + result <- do.call('rbind', lres) + } else { + result <- topN(clProf.df, showCategory) + } + } + ID <- NULL + if (includeAll == TRUE) { + result = subset(clProf.df, ID %in% result$ID) + } + ## remove zero count + result$Description <- as.character(result$Description) ## un-factor + GOlevel <- result[,c("ID", "Description")] ## GO ID and Term + #GOlevel <- unique(GOlevel) + result <- result[result$Count != 0, ] + #bug 19.11.2019 + #result$Description <- factor(result$Description,levels=rev(GOlevel[,2])) + if (by=="rowPercentage") { + Description <- Count <- NULL # to satisfy codetools + result <- ddply(result,.(Description),transform,Percentage = Count/sum(Count),Total = sum(Count)) + ## label GO Description with gene counts. + x <- mdply(result[, c("Description", "Total")], paste, sep=" (") + y <- sapply(x[,3], paste, ")", sep="") + result$Description <- y + + ## restore the original order of GO Description + xx <- result[,c(2,3)] + xx <- unique(xx) + rownames(xx) <- xx[,1] + Termlevel <- xx[as.character(GOlevel[,1]),2] + + ##drop the *Total* column + result <- result[, colnames(result) != "Total"] + result$Description <- factor(result$Description, levels=rev(Termlevel)) + + } else if (by == "count") { + ## nothing + } else if (by == "geneRatio") { ##default + gsize <- as.numeric(sub("/\\d+$", "", as.character(result$GeneRatio))) + gcsize <- as.numeric(sub("^\\d+/", "", as.character(result$GeneRatio))) + result$GeneRatio = gsize/gcsize + cluster <- paste(as.character(result$Cluster),"\n", "(", gcsize, ")", sep="") + lv <- unique(cluster)[order(as.numeric(unique(result$Cluster)))] + result$Cluster <- factor(cluster, levels = lv) + } else { + ## nothing + } + return(result) +} + +##function plotting.clusteProfile from clusterProfiler pkg +plotting.clusterProfile <- function(clProf.reshape.df,x = ~Cluster,type = "dot", colorBy = "p.adjust",by = "geneRatio",title="",font.size=12) { + + Description <- Percentage <- Count <- Cluster <- GeneRatio <- p.adjust <- pvalue <- NULL # to + if (type == "dot") { + if (by == "rowPercentage") { + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~Percentage)) + } else if (by == "count") { + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~Count)) + } else if (by == "geneRatio") { ##DEFAULT + p <- ggplot(clProf.reshape.df, + aes_(x = x, y = ~Description, size = ~GeneRatio)) + } else { + ## nothing here + } + if (any(colnames(clProf.reshape.df) == colorBy)) { + p <- p + + geom_point() + + aes_string(color=colorBy) + + scale_color_continuous(low="red", high="blue", guide=guide_colorbar(reverse=TRUE)) + ## scale_color_gradientn(guide=guide_colorbar(reverse=TRUE), colors = enrichplot:::sig_palette) + } else { + p <- p + geom_point(colour="steelblue") + } + } + + p <- p + xlab("") + ylab("") + ggtitle(title) + + theme_dose(font.size) + + ## theme(axis.text.x = element_text(colour="black", size=font.size, vjust = 1)) + + ## theme(axis.text.y = element_text(colour="black", + ## size=font.size, hjust = 1)) + + ## ggtitle(title)+theme_bw() + ## p <- p + theme(axis.text.x = element_text(angle=angle.axis.x, + ## hjust=hjust.axis.x, + ## vjust=vjust.axis.x)) + + return(p) +} + +make_dotplot<-function(res.cmp,ontology) { + + dfok<-fortify.compareClusterResult(res.cmp) + dfok$Description <- sapply(as.vector(dfok$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) + p<-plotting.clusterProfile(dfok, title="") + + #plot(p, type="dot") # + output_path= paste("GO_enrich_comp_",ontology,".png",sep="") + png(output_path,height = 720, width = 600) + pl <- plot(p, type="dot") + print(pl) + dev.off() +} + +get_cols <-function(input_cols) { + input_cols <- gsub("c","",gsub("C","",gsub(" ","",input_cols))) + if (grepl(":",input_cols)) { + first_col=unlist(strsplit(input_cols,":"))[1] + last_col=unlist(strsplit(input_cols,":"))[2] + cols=first_col:last_col + } else { + cols = as.integer(unlist(strsplit(input_cols,","))) + } + return(cols) +} + +#to check +cmp.GO <- function(l,fun="enrichGO",orgdb, ontology, readable=TRUE) { + cmpGO<-compareCluster(geneClusters = l, + fun=fun, + OrgDb = orgdb, + ont=ontology, + readable=TRUE) + + return(cmpGO) +} + +check_ids <- function(vector,type) { + uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" + entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" + if (type == "entrez") + return(grepl(entrez_id,vector)) + else if (type == "uniprot") { + return(grepl(uniprot_pattern,vector)) + } +} + +main = function() { + + #to get the args of the command line + args=get_args() + + #saved + #l<-list() + #for(j in 1:args$nb){ + # i<-j-1 + # ids<-get_ids(args$input.i, args$inputtype.i, args$header.i, args$column.i) + # l[[args$name.i]]<-ids + + #} + #saved + + + l<-list() + for(j in 1:args$nb){ + i<-j-1 + ids<-get_ids( args[[paste("input",i,sep=".")]], + args[[paste("inputtype",i,sep=".")]], + args[[paste("header",i,sep=".")]], + args[[paste("column",i,sep=".")]] ) + + l[[args[[paste("name",i,sep=".")]]]]<-ids + +} + + ont = strsplit(args$ont, ",")[[1]] + org=args$org + + #load annot package + suppressMessages(library(args$org, character.only = TRUE, quietly = TRUE)) + + # Extract OrgDb + if (args$org=="org.Hs.eg.db") { + orgdb<-org.Hs.eg.db + } else if (args$org=="org.Mm.eg.db") { + orgdb<-org.Mm.eg.db + } else if (args$org=="org.Rn.eg.db") { + orgdb<-org.Rn.eg.db + } + + for(ontology in ont) { + + res.cmp<-cmp.GO(l=l,fun="enrichGO",orgdb, ontology, readable=TRUE) + make_dotplot(res.cmp,ontology) + output_path = paste("GO_enrich_comp_",ontology,".tsv",sep="") + write.table(res.cmp@compareClusterResult, output_path, sep="\t", row.names=F, quote=F) + } + +} #end main + +main() +