Mercurial > repos > proteore > proteore_kegg_pathways_coverage
changeset 6:f4e32dee3b28 draft default tip
"planemo upload commit 151e7b469b231bbc43c4c39e8e836b05ab6d2253-dirty"
author | proteore |
---|---|
date | Mon, 17 May 2021 12:29:42 +0000 |
parents | 86d2f0377e69 |
children | |
files | kegg_identification.R kegg_identification.xml |
diffstat | 2 files changed, 137 insertions(+), 116 deletions(-) [+] |
line wrap: on
line diff
--- a/kegg_identification.R Fri Jan 24 10:46:35 2020 -0500 +++ b/kegg_identification.R Mon May 17 12:29:42 2021 +0000 @@ -1,19 +1,19 @@ -options(warn=-1) #TURN OFF WARNINGS !!!!!! +options(warn = -1) #TURN OFF WARNINGS !!!!!! suppressMessages(library(KEGGREST)) -get_args <- function(){ - +get_args <- function() { + ## Collect arguments args <- commandArgs(TRUE) - + ## Default setting when no arguments passed - if(length(args) < 1) { + if (length(args) < 1) { args <- c("--help") } - + ## Help section - if("--help" %in% args) { + if ("--help" %in% args) { cat("Pathview R script Arguments: --help Print this test @@ -24,180 +24,201 @@ --nb_pathways number of pathways to return --header boolean --output output path - --species species used to get specific pathways (hsa,mmu,rno) + --species species used to get specific pathways(hsa,mmu,rno) Example: - Rscript keggrest.R --input='P31946,P62258' --id_type='uniprot' --id_column 'c1' --header TRUE \n\n") - - q(save="no") + Rscript keggrest.R --input='P31946,P62258' --id_type='uniprot' + --id_column 'c1' --header TRUE \n\n") + + q(save = "no") } - - parseArgs <- function(x) strsplit(sub("^--", "", x), "=") - argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) - args <- as.list(as.character(argsDF$V2)) - names(args) <- argsDF$V1 - + + parseargs <- function(x) strsplit(sub("^--", "", x), "=") + argsdf <- as.data.frame(do.call("rbind", parseargs(args))) + args <- as.list(as.character(argsdf$V2)) + names(args) <- argsdf$V1 + return(args) } -str2bool <- function(x){ - if (any(is.element(c("t","true"),tolower(x)))){ - return (TRUE) - }else if (any(is.element(c("f","false"),tolower(x)))){ - return (FALSE) - }else{ +str2bool <- function(x) { + if (any(is.element(c("t", "true"), tolower(x)))) { + return(TRUE) + }else if (any(is.element(c("f", "false"), tolower(x)))) { + return(FALSE) + }else { return(NULL) } } -read_file <- function(path,header){ - file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) - if (inherits(file,"try-error")){ +read_file <- function(path, header) { + file <- try(read.csv(path, header = header, sep = "\t", + stringsAsFactors = FALSE, quote = "\"", check.names = F), silent = TRUE) + if (inherits(file, "try-error")) { stop("File not found !") - }else{ + }else { return(file) } } -get_pathways_list <- function(species){ +get_pathways_list <- function(species) { ##all available pathways for the species - pathways <-keggLink("pathway", species) - tot_path<-unique(pathways) - + pathways <- keggLink("pathway", species) + tot_path <- unique(pathways) + ##formating the dat into a list object ##key= pathway ID, value = genes of the pathway in the kegg format - pathways_list <- sapply(tot_path, function(pathway) names(which(pathways==pathway))) - return (pathways_list) + pathways_list <- sapply(tot_path, function(pathway) + names(which(pathways == pathway))) + return(pathways_list) } -get_list_from_cp <-function(list){ - list = strsplit(list, "[ \t\n]+")[[1]] - list = gsub("[[:blank:]]|\u00A0|NA","",list) - list = list[which(!is.na(list[list != ""]))] #remove empty entry - list = unique(gsub("-.+", "", list)) #Remove isoform accession number (e.g. "-2") +get_list_from_cp <- function(list) { + list <- strsplit(list, "[ \t\n]+")[[1]] + list <- gsub("[[:blank:]]|\u00A0|NA", "", list) + list <- list[which(!is.na(list[list != ""]))] #remove empty entry + list <- unique(gsub("-.+", "", list)) + #Remove isoform accession number (e.g. "-2") return(list) } -geneID_to_kegg <- function(vector,species){ - vector <- sapply(vector, function(x) paste(species,x,sep=":"),USE.NAMES = F) - return (vector) +geneid_to_kegg <- function(vector, species) { + vector <- sapply(vector, function(x) paste(species, x, sep = ":"), + USE.NAMES = F) + return(vector) } -to_keggID <- function(id_list,id_type){ - if (id_type == "ncbi-geneid") { - id_list <- unique(geneID_to_kegg(id_list,args$species)) - } else if (id_type=="uniprot"){ - id_list <- unique(sapply(id_list, function(x) paste(id_type,":",x,sep=""),USE.NAMES = F)) - if (length(id_list)>250){ - id_list <- split(id_list, ceiling(seq_along(id_list)/250)) - id_list <- sapply(id_list, function(x) keggConv("genes",x)) +to_keggid <- function(id_list, id_type) { + if (id_type == "ncbi-geneid") { + id_list <- unique(geneid_to_kegg(id_list, args$species)) + }else if (id_type == "uniprot") { + id_list <- unique(sapply(id_list, function(x) + paste(id_type, ":", x, sep = ""), USE.NAMES = F)) + if (length(id_list) > 250) { + id_list <- split(id_list, ceiling(seq_along(id_list) / 250)) + id_list <- sapply(id_list, function(x) keggConv("genes", x)) id_list <- unique(unlist(id_list)) } else { id_list <- unique(keggConv("genes", id_list)) } - } else if (id_type=="kegg-id") { + } else if (id_type == "kegg-id") { id_list <- unique(id_list) } - return (id_list) + return(id_list) } #take data frame, return data frame -split_ids_per_line <- function(line,ncol){ - +split_ids_per_line <- function(line, ncol) { + #print (line) - header = colnames(line) - line[ncol] = gsub("[[:blank:]]|\u00A0","",line[ncol]) - - if (length(unlist(strsplit(as.character(line[ncol]),";")))>1) { - if (length(line)==1 ) { - lines = as.data.frame(unlist(strsplit(as.character(line[ncol]),";")),stringsAsFactors = F) + header <- colnames(line) + line[ncol] <- gsub("[[:blank:]]|\u00A0", "", line[ncol]) + + if (length(unlist(strsplit(as.character(line[ncol]), ";"))) > 1) { + if (length(line) == 1) { + lines <- as.data.frame(unlist(strsplit( + as.character(line[ncol]), ";")), stringsAsFactors = F) } else { - if (ncol==1) { #first column - lines = suppressWarnings(cbind(unlist(strsplit(as.character(line[ncol]),";")), line[2:length(line)])) - } else if (ncol==length(line)) { #last column - lines = suppressWarnings(cbind(line[1:ncol-1],unlist(strsplit(as.character(line[ncol]),";")))) + if (ncol == 1) { #first column + lines <- suppressWarnings(cbind(unlist(strsplit( + as.character(line[ncol]), ";")), line[2:length(line)])) + } else if (ncol == length(line)) { #last column + lines <- suppressWarnings(cbind(line[1:ncol - 1], + unlist(strsplit(as.character(line[ncol]), ";")))) } else { - lines = suppressWarnings(cbind(line[1:ncol-1], unlist(strsplit(as.character(line[ncol]),";"),use.names = F), line[(ncol+1):length(line)])) + lines <- suppressWarnings(cbind(line[1:ncol - 1], + unlist(strsplit(as.character(line[ncol]), ";"), use.names = F), + line[(ncol + 1):length(line)])) } } - colnames(lines)=header + colnames(lines) <- header return(lines) } else { return(line) } } -#create new lines if there's more than one id per cell in the columns in order to have only one id per line -one_id_one_line <-function(tab,ncol){ - - if (ncol(tab)>1){ - - tab[,ncol] = sapply(tab[,ncol],function(x) gsub("[[:blank:]]","",x)) - header=colnames(tab) - res=as.data.frame(matrix(ncol=ncol(tab),nrow=0)) - for (i in 1:nrow(tab) ) { - lines = split_ids_per_line(tab[i,],ncol) - res = rbind(res,lines) +#create new lines if there's more than one id per cell in the columns in order +#to have only one id per line +one_id_one_line <- function(tab, ncol) { + + if (ncol(tab) > 1) { + + tab[, ncol] <- sapply(tab[, ncol], function(x) gsub("[[:blank:]]", "", x)) + header <- colnames(tab) + res <- as.data.frame(matrix(ncol = ncol(tab), nrow = 0)) + for (i in seq_len(nrow(tab))) { + lines <- split_ids_per_line(tab[i, ], ncol) + res <- rbind(res, lines) } - }else { - res = unlist(sapply(tab[,1],function(x) strsplit(x,";")),use.names = F) - res = data.frame(res[which(!is.na(res[res!=""]))],stringsAsFactors = F) - colnames(res)=colnames(tab) + } else { + res <- unlist(sapply(tab[, 1], function(x) strsplit(x, ";")), use.names = F) + res <- data.frame(res[which(!is.na(res[res != ""]))], stringsAsFactors = F) + colnames(res) <- colnames(tab) } return(res) } -kegg_mapping<- function(kegg_id_list,id_type,ref_ids) { - +kegg_mapping <- function(kegg_id_list, id_type, ref_ids) { + #mapping - map<-lapply(ref_ids, is.element, unique(kegg_id_list)) - names(map) <- sapply(names(map), function(x) gsub("path:","",x),USE.NAMES = FALSE) #remove the prefix "path:" - - in.path<-sapply(map, function(x) length(which(x==TRUE))) - tot.path<-sapply(map, length) - - ratio <- (as.numeric(in.path[which(in.path!=0)])) / (as.numeric(tot.path[which(in.path!=0)])) - ratio <- as.numeric(format(round(ratio*100, 2), nsmall = 2)) - + map <- lapply(ref_ids, is.element, unique(kegg_id_list)) + names(map) <- sapply(names(map), function(x) gsub("path:", "", x), + USE.NAMES = FALSE) #remove the prefix "path:" + + in_path <- sapply(map, function(x) length(which(x == TRUE))) + tot_path <- sapply(map, length) + + ratio <- (as.numeric(in_path[which(in_path != 0)])) / + (as.numeric(tot_path[which(in_path != 0)])) + ratio <- as.numeric(format(round(ratio * 100, 2), nsmall = 2)) + ##useful but LONG ## to do before : in step 1 - path.names<-names(in.path[which(in.path!=0)]) - name <- sapply(path.names, function(x) keggGet(x)[[1]]$NAME,USE.NAMES = FALSE) - - res<-data.frame(I(names(in.path[which(in.path!=0)])), I(name), ratio, as.numeric(in.path[which(in.path!=0)]), as.numeric(tot.path[which(in.path!=0)])) - res <- res[order(as.numeric(res[,3]),decreasing = TRUE),] - colnames(res)<-c("pathway_ID", "Description" , "Ratio IDs mapped/total IDs (%)" ,"nb KEGG genes IDs mapped in the pathway", "nb total of KEGG genes IDs present in the pathway") - + path_names <- names(in_path[which(in_path != 0)]) + name <- sapply(path_names, function(x) keggGet(x)[[1]]$NAME, + USE.NAMES = FALSE) + + res <- data.frame(I(names(in_path[which(in_path != 0)])), I(name), ratio, + as.numeric(in_path[which(in_path != 0)]), + as.numeric(tot_path[which(in_path != 0)])) + res <- res[order(as.numeric(res[, 3]), decreasing = TRUE), ] + colnames(res) <- c("pathway_ID", "Description", + "Ratio IDs mapped / total IDs (%)", + "nb KEGG genes IDs mapped in the pathway", + "nb total of KEGG genes IDs present in the pathway") + return(res) - + } #get args from command line args <- get_args() -#save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/kegg_identification/args.Rda") -#load("/home/dchristiany/proteore_project/ProteoRE/tools/kegg_identification/args.Rda") - ###setting variables -header = str2bool(args$header) -if (!is.null(args$id_list)) {id_list <- get_list_from_cp(args$id_list)} #get ids from copy/paste input -if (!is.null(args$input)) { #get ids from input file - csv <- read_file(args$input,header) - ncol <- as.numeric(gsub("c", "" ,args$id_column)) - csv <- one_id_one_line(csv,ncol) - id_list <- as.vector(csv[,ncol]) - id_list <- unique(id_list[which(!is.na(id_list[id_list!=""]))]) +header <- str2bool(args$header) +if (!is.null(args$id_list)) { + id_list <- get_list_from_cp(args$id_list) + } #get ids from copy/paste input +if (!is.null(args$input)) { #get ids from input file + csv <- read_file(args$input, header) + ncol <- as.numeric(gsub("c", "", args$id_column)) + csv <- one_id_one_line(csv, ncol) + id_list <- as.vector(csv[, ncol]) + id_list <- unique(id_list[which(!is.na(id_list[id_list != ""]))]) } #convert to keggID if needed -id_list <- to_keggID(id_list,args$id_type) +id_list <- to_keggid(id_list, args$id_type) #get pathways of species with associated KEGG ID genes pathways_list <- get_pathways_list(args$species) #mapping on pathways -res <- kegg_mapping(id_list,args$id_type,pathways_list) -if (nrow(res) > as.numeric(args$nb_pathways)) { res <- res[1:args$nb_pathways,] } +res <- kegg_mapping(id_list, args$id_type, pathways_list) +if (nrow(res) > as.numeric(args$nb_pathways)) { + res <- res[1:args$nb_pathways, ] + } -write.table(res, file=args$output, quote=FALSE, sep='\t',row.names = FALSE, col.names = TRUE) - +write.table(res, file = args$output, quote = FALSE, sep = "\t", + row.names = FALSE, col.names = TRUE)
--- a/kegg_identification.xml Fri Jan 24 10:46:35 2020 -0500 +++ b/kegg_identification.xml Mon May 17 12:29:42 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="kegg_identification" name="KEGG pathways mapping and coverage" version="2020.01.23"> +<tool id="kegg_identification" name="KEGG pathways mapping and coverage" version="2021.04.12"> <description>[PathView]</description> <requirements> <requirement type="package" version="1.18.0">bioconductor-keggrest</requirement>