Mercurial > repos > proteore > proteore_maps_visualization
diff kegg_maps_visualization.R @ 0:9845dc9c7323 draft
planemo upload commit ad5f1c5a1a71d7fa2bc8bac408856aa80b0fc2a3
author | proteore |
---|---|
date | Tue, 18 Dec 2018 10:02:54 -0500 |
parents | |
children | 6f389729a30b |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kegg_maps_visualization.R Tue Dec 18 10:02:54 2018 -0500 @@ -0,0 +1,354 @@ +#!/usr/bin/Rscript +#Rscript made for mapping genesID on KEGG pathway with Pathview package +#input : csv file containing ids (uniprot or geneID) to map, plus parameters +#output : KEGG pathway : jpeg or pdf file. + +options(warn=-1) #TURN OFF WARNINGS !!!!!! +suppressMessages(library("pathview")) +suppressMessages(library(KEGGREST)) + +read_file <- function(path,header){ + file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F, comment.char = "#"),silent=TRUE) + if (inherits(file,"try-error")){ + stop("File not found !") + }else{ + return(file) + } +} + +##### fuction to clean and concatenate pathway name (allow more flexibility for user input) +concat_string <- function(x){ + x <- gsub(" - .*","",x) + x <- gsub(" ","",x) + x <- gsub("-","",x) + x <- gsub("_","",x) + x <- gsub(",","",x) + x <- gsub("\\'","",x) + x <- gsub("\\(.*)","",x) + x <- gsub("\\/","",x) + x <- tolower(x) + return(x) +} + +#return output suffix (pathway name) from id kegg (ex : hsa:00010) +get_suffix <- function(pathways_list,species,id){ + suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2]) + suffix = gsub(" ","_",suffix) + if (nchar(suffix) > 50){ + suffix = substr(suffix,1,50) + } + return(suffix) +} + +str2bool <- function(x){ + if (any(is.element(c("t","true"),tolower(x)))){ + return (TRUE) + }else if (any(is.element(c("f","false"),tolower(x)))){ + return (FALSE) + }else{ + return(NULL) + } +} + +is.letter <- function(x) grepl("[[:alpha:]]", x) + +#### hsa00010 -> 00010 +remove_kegg_prefix <- function(x){ + x = gsub(":","",x) + if (substr(x,1,4) == 'path'){ + x=substr(x,5,nchar(x)) + } + if (is.letter(substr(x,1,3))){ + x <- substr(x,4,nchar(x)) + } + return(x) +} + +kegg_to_geneID <- function(vector){ + vector <- sapply(vector, function(x) unlist(strsplit(x,":"))[2],USE.NAMES = F) + return (vector) +} + +clean_bad_character <- function(string) { + string <- gsub("X","",string) + return(string) +} + +get_list_from_cp <-function(list){ + list = strsplit(list, "[ \t\n]+")[[1]] + list = list[list != ""] #remove empty entry + list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") + return(list) +} + +get_ref_pathways <- function(species){ + ##all available pathways for the species + pathways <-keggLink("pathway", species) + tot_path<-unique(pathways) + + ##formating the dat into a list object + ##key= pathway ID, value = genes of the pathway in the kegg format + pathways_list <- sapply(tot_path, function(pathway) names(which(pathways==pathway))) + return (pathways_list) +} + +mapping_summary <- function(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID){ + ref_pathways = get_ref_pathways(species) + names(ref_pathways) <- sapply(names(ref_pathways), function(x) gsub("path:[a-z]{3}","",x),USE.NAMES = F) + + #genes present in pathway + genes = ref_pathways[id][[1]] + nb_genes = length(genes) + + #genes mapped on pathway genes + mapped <- unlist(sapply(pv.out$plot.data.gene$all.mapped, function(x) strsplit(x,",")),use.names = F) + mapped = unique(mapped[mapped!=""]) + nb_mapped <- length(mapped) + + #compue ratio of mapping + ratio = round((nb_mapped/nb_genes)*100, 2) + if (is.nan(ratio)) { ratio = ""} + pathway_id = paste(species,id,sep="") + pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2]) + + if (id_type=="geneid" || id_type=="keggid") { + row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";")) + names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped", + "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped") + } else if (id_type=="uniprotid") { + row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";")) + names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped", + "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped") + } + return(row) +} + +#take data frame, return data frame +split_ids_per_line <- function(line,ncol){ + + #print (line) + header = colnames(line) + line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) + + if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { + if (length(line)==1 ) { + lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) + } else { + if (ncol==1) { #first column + lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) + } else if (ncol==length(line)) { #last column + lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) + } else { + lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) + } + } + colnames(lines)=header + return(lines) + } else { + return(line) + } +} + +#create new lines if there's more than one id per cell in the columns in order to have only one id per line +one_id_one_line <-function(tab,ncol){ + + if (ncol(tab)>1){ + + tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) + header=colnames(tab) + res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) + for (i in 1:nrow(tab) ) { + lines = split_ids_per_line(tab[i,],ncol) + res = rbind(res,lines) + } + }else { + res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) + res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) + colnames(res)=colnames(tab) + } + return(res) +} + +get_limit <- function(mat) { + min = min(apply(mat,2,min)) + max = max(apply(mat,2,max)) + return(c(min,max)) +} + +get_args <- function(){ + + ## Collect arguments + args <- commandArgs(TRUE) + + ## Default setting when no arguments passed + if(length(args) < 1) { + args <- c("--help") + } + + ## Help section + if("--help" %in% args) { + cat("Pathview R script + Arguments: + --help Print this test + --input path of the input file (must contains a colum of uniprot and/or geneID accession number) + --id_list list of ids to use, ',' separated + --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 + --id_type Type of accession number ('uniprotID' or 'geneID') + --id_column Column containing accesion number of interest (ex : 'c1') + --header Boolean, TRUE if header FALSE if not + --output Output filename + --fold_change_col Column(s) containing fold change values (comma separated) + --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph + --species KEGG species (hsa, mmu, ...) + --pathways_input Tab with pathways in a column, output format of find_pathways + --pathway_col Column of pathways to use + --header2 Boolean, TRUE if header FALSE if not + --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...) + + Example: + ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n") + + q(save="no") + } + + parseArgs <- function(x) strsplit(sub("^--", "", x), "=") + argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) + args <- as.list(as.character(argsDF$V2)) + names(args) <- argsDF$V1 + + return(args) +} + +main <- function(){ + + args <- get_args() + + #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") + #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") + + ###setting variables + if (!is.null(args$pathways_id)) { + ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) + ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) + }else if (!is.null(args$pathways_input)){ + header2 <- str2bool(args$header2) + pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col)) + pathways_file = read_file(args$pathways_input,header2) + ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) + } + pathways_list <- read_file(args$pathways_list,F) + if (!is.null(args$id_list)) { + id_list <- get_list_from_cp(args$id_list) + } + id_type <- tolower(args$id_type) + ncol <- as.numeric(gsub("c", "" ,args$id_column)) + header <- str2bool(args$header) + native_kegg <- str2bool(args$native_kegg) + species=args$species + fold_change_data = str2bool(args$fold_change_data) + + #org list used in mapped2geneID + org <- c('Hs','Mm','Rn') + names(org) <- c('hsa','mmu','rno') + + #read input file or list + if (!is.null(args$input)){ + tab <- read_file(args$input,header) + tab <- data.frame(tab[which(tab[ncol]!=""),],stringsAsFactors = F) + tab = one_id_one_line(tab,ncol) + } else { + id_list = gsub("[[:blank:]]|\u00A0|NA","",id_list) + id_list = unique(id_list[id_list!=""]) + tab <- data.frame(id_list,stringsAsFactors = F) + ncol=1 + } + + + ##### map uniprotID to entrez geneID and kegg to geneID + uniprotID="" + mapped2geneID="" + if (id_type == "uniprotid") { + uniprotID=tab[,ncol] + mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL) + geneID = mapped2geneID[,2] + tab = cbind(tab,geneID) + ncol=ncol(tab) + }else if (id_type == "keggid"){ + keggID = tab[,ncol] + geneID = kegg_to_geneID(keggID) + tab = cbind(tab,geneID) + ncol=ncol(tab) + }else if (id_type == "geneid"){ + colnames(tab)[ncol] <- "geneID" + } + + ##### build matrix to map on KEGG pathway (kgml : KEGG xml) + geneID_indices = which(!is.na(tab$geneID)) + if (fold_change_data) { + fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),","))) + if (length(fold_change) > 3) { fold_change= fold_change[1:3] } + if (length(fold_change)==1){ + tab[,fold_change] <- as.double(gsub(",",".",as.character(tab[,fold_change]) )) + } else { + tab[,fold_change] <- apply(tab[,fold_change],2,function(x) as.double(gsub(",",".",as.character(x)))) + } + mat = tab[geneID_indices,c(ncol,fold_change)] + mat = mat[(!duplicated(mat$geneID)),] + geneID=mat$geneID + mat = as.data.frame(mat[,-1]) + row.names(mat)=geneID + limit = get_limit(mat) + } else { + mat = unique(as.character(tab$geneID[!is.na(tab$geneID[tab$geneID!=""])])) + geneID=mat + limit=1 + } + + #####mapping geneID (with or without expression values) on KEGG pathway + plot.col.key= TRUE + low_color = "green" + mid_color = "#F3F781" #yellow + high_color = "red" + if (!fold_change_data) { + plot.col.key= FALSE #if there's no exrepession data, we don't show the color key + high_color = "#81BEF7" #blue + } + + #create graph(s) and text output + for (id in ids) { + suffix= get_suffix(pathways_list,species,id) + pv.out <- suppressMessages(pathview(gene.data = mat, + gene.idtype = "entrez", + pathway.id = id, + species = species, + kegg.dir = ".", + out.suffix=suffix, + kegg.native = native_kegg, + low = list(gene = low_color, cpd = "blue"), + mid = list(gene = mid_color, cpd = "transparent"), + high = list(gene = high_color, cpd = "yellow"), + na.col="#D8D8D8", #gray + cpd.data=NULL, + plot.col.key = plot.col.key, + pdf.size=c(9,9), + limit=list(gene=limit, cpd=limit))) + + if (is.list(pv.out)){ + + #creating text file + if (!exists("DF")) { + DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F) + } else { + #print (mapping_summary(pv.out,species,id)) + DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F)) + } + } + } + + DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA + + #text file output + write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE) +} + +main()