Mercurial > repos > proteore > proteore_maps_visualization
comparison kegg_maps_visualization.R @ 0:9845dc9c7323 draft
planemo upload commit ad5f1c5a1a71d7fa2bc8bac408856aa80b0fc2a3
author | proteore |
---|---|
date | Tue, 18 Dec 2018 10:02:54 -0500 |
parents | |
children | 6f389729a30b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9845dc9c7323 |
---|---|
1 #!/usr/bin/Rscript | |
2 #Rscript made for mapping genesID on KEGG pathway with Pathview package | |
3 #input : csv file containing ids (uniprot or geneID) to map, plus parameters | |
4 #output : KEGG pathway : jpeg or pdf file. | |
5 | |
6 options(warn=-1) #TURN OFF WARNINGS !!!!!! | |
7 suppressMessages(library("pathview")) | |
8 suppressMessages(library(KEGGREST)) | |
9 | |
10 read_file <- function(path,header){ | |
11 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F, comment.char = "#"),silent=TRUE) | |
12 if (inherits(file,"try-error")){ | |
13 stop("File not found !") | |
14 }else{ | |
15 return(file) | |
16 } | |
17 } | |
18 | |
19 ##### fuction to clean and concatenate pathway name (allow more flexibility for user input) | |
20 concat_string <- function(x){ | |
21 x <- gsub(" - .*","",x) | |
22 x <- gsub(" ","",x) | |
23 x <- gsub("-","",x) | |
24 x <- gsub("_","",x) | |
25 x <- gsub(",","",x) | |
26 x <- gsub("\\'","",x) | |
27 x <- gsub("\\(.*)","",x) | |
28 x <- gsub("\\/","",x) | |
29 x <- tolower(x) | |
30 return(x) | |
31 } | |
32 | |
33 #return output suffix (pathway name) from id kegg (ex : hsa:00010) | |
34 get_suffix <- function(pathways_list,species,id){ | |
35 suffix = gsub("/","or",pathways_list[pathways_list[,1]==paste(species,id,sep=""),2]) | |
36 suffix = gsub(" ","_",suffix) | |
37 if (nchar(suffix) > 50){ | |
38 suffix = substr(suffix,1,50) | |
39 } | |
40 return(suffix) | |
41 } | |
42 | |
43 str2bool <- function(x){ | |
44 if (any(is.element(c("t","true"),tolower(x)))){ | |
45 return (TRUE) | |
46 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
47 return (FALSE) | |
48 }else{ | |
49 return(NULL) | |
50 } | |
51 } | |
52 | |
53 is.letter <- function(x) grepl("[[:alpha:]]", x) | |
54 | |
55 #### hsa00010 -> 00010 | |
56 remove_kegg_prefix <- function(x){ | |
57 x = gsub(":","",x) | |
58 if (substr(x,1,4) == 'path'){ | |
59 x=substr(x,5,nchar(x)) | |
60 } | |
61 if (is.letter(substr(x,1,3))){ | |
62 x <- substr(x,4,nchar(x)) | |
63 } | |
64 return(x) | |
65 } | |
66 | |
67 kegg_to_geneID <- function(vector){ | |
68 vector <- sapply(vector, function(x) unlist(strsplit(x,":"))[2],USE.NAMES = F) | |
69 return (vector) | |
70 } | |
71 | |
72 clean_bad_character <- function(string) { | |
73 string <- gsub("X","",string) | |
74 return(string) | |
75 } | |
76 | |
77 get_list_from_cp <-function(list){ | |
78 list = strsplit(list, "[ \t\n]+")[[1]] | |
79 list = list[list != ""] #remove empty entry | |
80 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") | |
81 return(list) | |
82 } | |
83 | |
84 get_ref_pathways <- function(species){ | |
85 ##all available pathways for the species | |
86 pathways <-keggLink("pathway", species) | |
87 tot_path<-unique(pathways) | |
88 | |
89 ##formating the dat into a list object | |
90 ##key= pathway ID, value = genes of the pathway in the kegg format | |
91 pathways_list <- sapply(tot_path, function(pathway) names(which(pathways==pathway))) | |
92 return (pathways_list) | |
93 } | |
94 | |
95 mapping_summary <- function(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID){ | |
96 ref_pathways = get_ref_pathways(species) | |
97 names(ref_pathways) <- sapply(names(ref_pathways), function(x) gsub("path:[a-z]{3}","",x),USE.NAMES = F) | |
98 | |
99 #genes present in pathway | |
100 genes = ref_pathways[id][[1]] | |
101 nb_genes = length(genes) | |
102 | |
103 #genes mapped on pathway genes | |
104 mapped <- unlist(sapply(pv.out$plot.data.gene$all.mapped, function(x) strsplit(x,",")),use.names = F) | |
105 mapped = unique(mapped[mapped!=""]) | |
106 nb_mapped <- length(mapped) | |
107 | |
108 #compue ratio of mapping | |
109 ratio = round((nb_mapped/nb_genes)*100, 2) | |
110 if (is.nan(ratio)) { ratio = ""} | |
111 pathway_id = paste(species,id,sep="") | |
112 pathway_name = as.character(pathways_list[pathways_list[,1]==pathway_id,][2]) | |
113 | |
114 if (id_type=="geneid" || id_type=="keggid") { | |
115 row <- c(pathway_id,pathway_name,length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";")) | |
116 names(row) <- c("KEGG pathway ID","pathway name","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | |
117 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped") | |
118 } else if (id_type=="uniprotid") { | |
119 row <- c(pathway_id,pathway_name,length(unique(uniprotID)),length(unique(geneID)),nb_mapped,nb_genes,ratio,paste(mapped,collapse=";"),paste(mapped2geneID[which(mapped2geneID[,2] %in% mapped)],collapse=";")) | |
120 names(row) <- c("KEGG pathway ID","pathway name","nb of Uniprot_AC used","nb of Entrez gene ID used","nb of Entrez gene ID mapped", | |
121 "nb of Entrez gene ID in the pathway", "ratio of Entrez gene ID mapped (%)","Entrez gene ID mapped","uniprot_AC mapped") | |
122 } | |
123 return(row) | |
124 } | |
125 | |
126 #take data frame, return data frame | |
127 split_ids_per_line <- function(line,ncol){ | |
128 | |
129 #print (line) | |
130 header = colnames(line) | |
131 line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) | |
132 | |
133 if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { | |
134 if (length(line)==1 ) { | |
135 lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) | |
136 } else { | |
137 if (ncol==1) { #first column | |
138 lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) | |
139 } else if (ncol==length(line)) { #last column | |
140 lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) | |
141 } else { | |
142 lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) | |
143 } | |
144 } | |
145 colnames(lines)=header | |
146 return(lines) | |
147 } else { | |
148 return(line) | |
149 } | |
150 } | |
151 | |
152 #create new lines if there's more than one id per cell in the columns in order to have only one id per line | |
153 one_id_one_line <-function(tab,ncol){ | |
154 | |
155 if (ncol(tab)>1){ | |
156 | |
157 tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) | |
158 header=colnames(tab) | |
159 res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) | |
160 for (i in 1:nrow(tab) ) { | |
161 lines = split_ids_per_line(tab[i,],ncol) | |
162 res = rbind(res,lines) | |
163 } | |
164 }else { | |
165 res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) | |
166 res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) | |
167 colnames(res)=colnames(tab) | |
168 } | |
169 return(res) | |
170 } | |
171 | |
172 get_limit <- function(mat) { | |
173 min = min(apply(mat,2,min)) | |
174 max = max(apply(mat,2,max)) | |
175 return(c(min,max)) | |
176 } | |
177 | |
178 get_args <- function(){ | |
179 | |
180 ## Collect arguments | |
181 args <- commandArgs(TRUE) | |
182 | |
183 ## Default setting when no arguments passed | |
184 if(length(args) < 1) { | |
185 args <- c("--help") | |
186 } | |
187 | |
188 ## Help section | |
189 if("--help" %in% args) { | |
190 cat("Pathview R script | |
191 Arguments: | |
192 --help Print this test | |
193 --input path of the input file (must contains a colum of uniprot and/or geneID accession number) | |
194 --id_list list of ids to use, ',' separated | |
195 --pathways_id Id(s) of pathway(s) to use, if several, semicolon separated list : hsa00010;hsa05412 | |
196 --id_type Type of accession number ('uniprotID' or 'geneID') | |
197 --id_column Column containing accesion number of interest (ex : 'c1') | |
198 --header Boolean, TRUE if header FALSE if not | |
199 --output Output filename | |
200 --fold_change_col Column(s) containing fold change values (comma separated) | |
201 --native_kegg TRUE : native KEGG graph, FALSE : Graphviz graph | |
202 --species KEGG species (hsa, mmu, ...) | |
203 --pathways_input Tab with pathways in a column, output format of find_pathways | |
204 --pathway_col Column of pathways to use | |
205 --header2 Boolean, TRUE if header FALSE if not | |
206 --pathways_list path of file containg the species pathways list (hsa_pathways.loc, mmu_pathways.loc, ...) | |
207 | |
208 Example: | |
209 ./PathView.R --input 'input.csv' --pathway_id '05412' --id_type 'uniprotID' --id_column 'c1' --header TRUE \n\n") | |
210 | |
211 q(save="no") | |
212 } | |
213 | |
214 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | |
215 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | |
216 args <- as.list(as.character(argsDF$V2)) | |
217 names(args) <- argsDF$V1 | |
218 | |
219 return(args) | |
220 } | |
221 | |
222 main <- function(){ | |
223 | |
224 args <- get_args() | |
225 | |
226 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") | |
227 #load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_maps_visualization/args.Rda") | |
228 | |
229 ###setting variables | |
230 if (!is.null(args$pathways_id)) { | |
231 ids <- get_list_from_cp(clean_bad_character(args$pathways_id)) | |
232 ids <- sapply(ids, function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | |
233 }else if (!is.null(args$pathways_input)){ | |
234 header2 <- str2bool(args$header2) | |
235 pathway_col <- as.numeric(gsub("c", "" ,args$pathway_col)) | |
236 pathways_file = read_file(args$pathways_input,header2) | |
237 ids <- sapply(rapply(strsplit(clean_bad_character(pathways_file[,pathway_col]),","),c), function(x) remove_kegg_prefix(x),USE.NAMES = FALSE) | |
238 } | |
239 pathways_list <- read_file(args$pathways_list,F) | |
240 if (!is.null(args$id_list)) { | |
241 id_list <- get_list_from_cp(args$id_list) | |
242 } | |
243 id_type <- tolower(args$id_type) | |
244 ncol <- as.numeric(gsub("c", "" ,args$id_column)) | |
245 header <- str2bool(args$header) | |
246 native_kegg <- str2bool(args$native_kegg) | |
247 species=args$species | |
248 fold_change_data = str2bool(args$fold_change_data) | |
249 | |
250 #org list used in mapped2geneID | |
251 org <- c('Hs','Mm','Rn') | |
252 names(org) <- c('hsa','mmu','rno') | |
253 | |
254 #read input file or list | |
255 if (!is.null(args$input)){ | |
256 tab <- read_file(args$input,header) | |
257 tab <- data.frame(tab[which(tab[ncol]!=""),],stringsAsFactors = F) | |
258 tab = one_id_one_line(tab,ncol) | |
259 } else { | |
260 id_list = gsub("[[:blank:]]|\u00A0|NA","",id_list) | |
261 id_list = unique(id_list[id_list!=""]) | |
262 tab <- data.frame(id_list,stringsAsFactors = F) | |
263 ncol=1 | |
264 } | |
265 | |
266 | |
267 ##### map uniprotID to entrez geneID and kegg to geneID | |
268 uniprotID="" | |
269 mapped2geneID="" | |
270 if (id_type == "uniprotid") { | |
271 uniprotID=tab[,ncol] | |
272 mapped2geneID = id2eg(ids = uniprotID, category = "uniprot", org = org[[species]], pkg.name = NULL) | |
273 geneID = mapped2geneID[,2] | |
274 tab = cbind(tab,geneID) | |
275 ncol=ncol(tab) | |
276 }else if (id_type == "keggid"){ | |
277 keggID = tab[,ncol] | |
278 geneID = kegg_to_geneID(keggID) | |
279 tab = cbind(tab,geneID) | |
280 ncol=ncol(tab) | |
281 }else if (id_type == "geneid"){ | |
282 colnames(tab)[ncol] <- "geneID" | |
283 } | |
284 | |
285 ##### build matrix to map on KEGG pathway (kgml : KEGG xml) | |
286 geneID_indices = which(!is.na(tab$geneID)) | |
287 if (fold_change_data) { | |
288 fold_change <- as.integer(unlist(strsplit(gsub("c","",args$fold_change_col),","))) | |
289 if (length(fold_change) > 3) { fold_change= fold_change[1:3] } | |
290 if (length(fold_change)==1){ | |
291 tab[,fold_change] <- as.double(gsub(",",".",as.character(tab[,fold_change]) )) | |
292 } else { | |
293 tab[,fold_change] <- apply(tab[,fold_change],2,function(x) as.double(gsub(",",".",as.character(x)))) | |
294 } | |
295 mat = tab[geneID_indices,c(ncol,fold_change)] | |
296 mat = mat[(!duplicated(mat$geneID)),] | |
297 geneID=mat$geneID | |
298 mat = as.data.frame(mat[,-1]) | |
299 row.names(mat)=geneID | |
300 limit = get_limit(mat) | |
301 } else { | |
302 mat = unique(as.character(tab$geneID[!is.na(tab$geneID[tab$geneID!=""])])) | |
303 geneID=mat | |
304 limit=1 | |
305 } | |
306 | |
307 #####mapping geneID (with or without expression values) on KEGG pathway | |
308 plot.col.key= TRUE | |
309 low_color = "green" | |
310 mid_color = "#F3F781" #yellow | |
311 high_color = "red" | |
312 if (!fold_change_data) { | |
313 plot.col.key= FALSE #if there's no exrepession data, we don't show the color key | |
314 high_color = "#81BEF7" #blue | |
315 } | |
316 | |
317 #create graph(s) and text output | |
318 for (id in ids) { | |
319 suffix= get_suffix(pathways_list,species,id) | |
320 pv.out <- suppressMessages(pathview(gene.data = mat, | |
321 gene.idtype = "entrez", | |
322 pathway.id = id, | |
323 species = species, | |
324 kegg.dir = ".", | |
325 out.suffix=suffix, | |
326 kegg.native = native_kegg, | |
327 low = list(gene = low_color, cpd = "blue"), | |
328 mid = list(gene = mid_color, cpd = "transparent"), | |
329 high = list(gene = high_color, cpd = "yellow"), | |
330 na.col="#D8D8D8", #gray | |
331 cpd.data=NULL, | |
332 plot.col.key = plot.col.key, | |
333 pdf.size=c(9,9), | |
334 limit=list(gene=limit, cpd=limit))) | |
335 | |
336 if (is.list(pv.out)){ | |
337 | |
338 #creating text file | |
339 if (!exists("DF")) { | |
340 DF <- data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F) | |
341 } else { | |
342 #print (mapping_summary(pv.out,species,id)) | |
343 DF <- rbind(DF,data.frame(t(mapping_summary(pv.out,species,id,id_type,pathways_list,geneID,uniprotID,mapped2geneID)),stringsAsFactors = F,check.names = F)) | |
344 } | |
345 } | |
346 } | |
347 | |
348 DF <- as.data.frame(apply(DF, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" et " " to NA | |
349 | |
350 #text file output | |
351 write.table(DF,file=args$output,quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE) | |
352 } | |
353 | |
354 main() |