comparison id_converter_UniProt.R @ 5:0584344186eb draft

planemo upload commit 6e27a0eaf8172154128960497271e8f54341acd7-dirty
author proteore
date Tue, 20 Mar 2018 10:51:41 -0400
parents 134949593a3b
children 659f1248f535
comparison
equal deleted inserted replaced
4:134949593a3b 5:0584344186eb
37 # Ensembl_ENSG: Ensembl gene identifiers (e.g. ENSG00000166913) 37 # Ensembl_ENSG: Ensembl gene identifiers (e.g. ENSG00000166913)
38 # Ensembl_ENST: Ensembl transcript identifiers (e.g. ENST00000353703; ENST00000372839) 38 # Ensembl_ENST: Ensembl transcript identifiers (e.g. ENST00000353703; ENST00000372839)
39 # Ensembl_ENSP: Ensembl protein identifiers (e.g. ENSP00000300161; ENSP00000361930) 39 # Ensembl_ENSP: Ensembl protein identifiers (e.g. ENSP00000300161; ENSP00000361930)
40 40
41 mapping = function() { 41 mapping = function() {
42 # Extract arguments 42 args <- commandArgs(TRUE)
43 args = commandArgs(trailingOnly = TRUE) 43 if(length(args)<1) {
44 #print(args) 44 args <- c("--help")
45 if (length(args) != 6) {
46 stop("Not enough/Too many arguments", call. = FALSE)
47 } 45 }
48 else { 46
49 input_id_type = args[1] 47 # Help section
50 list_id = args[2] 48 if("--help" %in% args) {
51 list_id_input_type = args[3] 49 cat("Selection and Annotation HPA
52 options = strsplit(args[4], ",")[[1]] 50 Arguments:
53 output = args[5] 51 --ref_file: path to reference file (human_id_mapping_file.txt)
54 human_id_mapping_file = args[6] 52 --input_type: type of input (list of id or filename)
53 --id_type: type of input IDs
54 --input: list of IDs (text or filename)
55 --column_number: the column number which contains list of input IDs
56 --header: true/false if your file contains a header
57 --target_ids: target IDs to map to
58 --output: output filename \n")
59 q(save="no")
60 }
61
62 # Parse arguments
63 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
64 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
65 args <- as.list(as.character(argsDF$V2))
66 names(args) <- argsDF$V1
67
68 input_id_type = args$id_type # Uniprot, ENSG....
69 list_id_input_type = args$input_type # list or file
70 options = strsplit(args$target_ids, ",")[[1]]
71 output = args$output
72 human_id_mapping_file = args$ref_file
55 73
56 # Extract ID maps 74 # Extract input IDs
57 human_id_map = read.table(human_id_mapping_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings = "", quote = "") 75 if (list_id_input_type == "list") {
76 print(args$input)
77 list_id = strsplit(args$input, "[ \t\n]+")[[1]]
78 # Remove isoform accession number (e.g. "-2")
79 list_id = gsub("-.+", "", list_id)
80 }
81 else if (list_id_input_type == "file") {
82 filename = args$input
83 column_number = as.numeric(gsub("c", "" ,args$column_number))
84 header = args$header
85 file_all = readfile(filename, header)
86 list_id = c()
87 list_id = sapply(strsplit(file_all[,column_number], ";"), "[", 1)
88 # Remove isoform accession number (e.g. "-2")
89 list_id = gsub("-.+", "", list_id)
90 }
91
92 # Extract ID maps
93 human_id_map = read.table(human_id_mapping_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings = "", quote = "")
94
95 names = c()
58 96
59 # Extract input IDs 97 # Map IDs
60 if (list_id_input_type == "list") { 98 res = matrix(nrow=length(list_id), ncol=0)
61 list_id = strsplit(args[2], " ")[[1]]
62 # Remove isoform accession number (e.g. "-2")
63 list_id = gsub("-.+", "", list_id)
64 }
65 else if (list_id_input_type == "file") {
66 filename = as.character(strsplit(list_id, ",")[[1]][1])
67 column_number = as.numeric(gsub("c", "" ,strsplit(list_id, ",")[[1]][2]))
68 header = strsplit(list_id, ",")[[1]][3]
69 file_all = readfile(filename, header)
70 print(class(file_all))
71 str(file_all)
72 print(class(file_all[,1]))
73 list_id = c()
74 list_id = sapply(strsplit(file_all[,column_number], ";"), "[", 1)
75 # Remove isoform accession number (e.g. "-2")
76 list_id = gsub("-.+", "", list_id)
77 }
78 names = c()
79
80 # Map IDs
81 res = matrix(nrow=length(list_id), ncol=0)
82 99
83 for (opt in options) { 100 for (opt in options) {
84 names = c(names, opt) 101 names = c(names, opt)
85 mapped = human_id_map[match(list_id, human_id_map[input_id_type][,]),][opt][,] 102 mapped = human_id_map[match(list_id, human_id_map[input_id_type][,]),][opt][,]
86 res = cbind(res, matrix(mapped)) 103 res = cbind(res, matrix(mapped))
87 } 104 }
88 105
89 # Write output 106 # Write output
90 if (list_id_input_type == "list") { 107 if (list_id_input_type == "list") {
91 res = cbind(as.matrix(list_id), res) 108 res = cbind(as.matrix(list_id), res)
92 names = c(input_id_type, names) 109 names = c(input_id_type, names)
93 colnames(res) = names 110 colnames(res) = names
94 write.table(res, output, row.names = FALSE, sep = "\t", quote = FALSE) 111 write.table(res, output, row.names = FALSE, sep = "\t", quote = FALSE)
95 } 112 }
96 else if (list_id_input_type == "file") { 113 else if (list_id_input_type == "file") {
97 names(res) = options 114 names(res) = options
98 names = c(names(file_all), names) 115 names = c(names(file_all), names)
99 output_content = cbind(file_all, res) 116 output_content = cbind(file_all, res)
100 colnames(output_content) = names 117 colnames(output_content) = names
101 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE) 118 write.table(output_content, output, row.names = FALSE, sep = "\t", quote = FALSE)
102 }
103 } 119 }
104 } 120 }
105 121
106 mapping() 122 mapping()
107 123