comparison protein_features.R @ 1:bfc679370c64 draft

planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author proteore
date Fri, 16 Feb 2018 04:06:16 -0500
parents
children 867d47ff782c
comparison
equal deleted inserted replaced
0:e3b52db3d583 1:bfc679370c64
# Read a tab-separated file and return its content as a data.frame.
#
# Args:
#   filename: path of the tab-separated file to read.
#   header:   the string "true" when the first line of the file holds the
#             column names; any other value means there is no header line.
#
# Returns:
#   A data.frame holding the file content, without the rows whose cells are
#   all NA or empty. When header == "true" the columns are named after the
#   first line of the file; otherwise read.table's default names (V1, V2, ...)
#   are kept.
readfile <- function(filename, header) {
  # Shared reader: both the header line and the data use the same settings.
  read_tsv <- function(...) {
    read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE,
               fill = TRUE, na.strings = c("", "NA"), blank.lines.skip = TRUE,
               ...)
  }

  if (header == "true") {
    # Read only the first line of the file as header. The cells are forced to
    # character so that names such as "T", "F" or "001" are not turned into
    # TRUE, FALSE or 1 by read.table's automatic type conversion.
    headers <- read_tsv(nrows = 1, colClasses = "character")
    # Read the data of the file (skipping the first row)
    file <- read_tsv(skip = 1)
  } else {
    headers <- NULL
    file <- read_tsv()
  }

  # Remove empty rows (every cell is NA or an empty string)
  empty_row <- apply(is.na(file) | file == "", 1, all)
  file <- file[!empty_row, , drop = FALSE]

  if (!is.null(headers)) {
    header_names <- as.character(unlist(headers, use.names = FALSE))
    # Empty header cells are read as NA; keep labelling them "NA" as before.
    header_names[is.na(header_names)] <- "NA"
    # And assign the header to the data
    names(file) <- header_names
  }

  file
}
20
# Split "--key=value" command-line tokens into a named list of strings.
# Only the first "=" separates the key from its value, so a value may itself
# contain "=" and may be empty ("--key=" gives ""). A token without any "="
# is kept as a key with an empty value.
parse_cli_args <- function(tokens) {
  tokens <- sub("^--", "", tokens)
  eq_pos <- as.integer(regexpr("=", tokens, fixed = TRUE))
  has_value <- eq_pos > 0
  keys <- ifelse(has_value, substr(tokens, 1, eq_pos - 1), tokens)
  values <- ifelse(has_value, substr(tokens, eq_pos + 1, nchar(tokens)), "")
  parsed <- as.list(values)
  names(parsed) <- keys
  parsed
}

# Return the value of a mandatory command-line argument, or stop with an
# explicit message when it was not supplied.
get_required_arg <- function(args, key) {
  value <- args[[key]]
  if (is.null(value)) {
    stop("Missing required argument --", key, call. = FALSE)
  }
  value
}

# Command-line entry point: look up neXtProt features for a list of protein
# ids and write them to a tab-separated file.
#
# The arguments are read from commandArgs(TRUE) as "--key=value" tokens:
#   --inputtype  "copypaste" (ids separated by spaces in --input) or
#                "tabfile" (--input is the path of a tab-separated file)
#   --input      the ids, or the path of the input file
#   --column     (tabfile only) column holding the ids, e.g. "c1" or "1"
#   --header     (tabfile only) "true" when the file has a header line
#   --nextprot   path of the neXtProt information file; it is read with a
#                header line and must contain a "NextprotID" column
#   --type       type of the ids; "uniprot" ids are prefixed with "NX_"
#   --argsP1, --argsP2, --argsP3
#                comma-separated names of the neXtProt columns to report
#   --output     path of the text file to write
#
# With no argument, or with "--help", the usage is printed and R quits.
#
# Output: for "copypaste", one row per id (id column named after --type)
# followed by the requested columns; for "tabfile", the input table with the
# requested columns appended. Ids absent from neXtProt get NA. When none of
# the ids is found, a one-line message is written to --output instead.
#
# Stops with an error when a mandatory argument is missing, when --inputtype
# is unknown, when --column is not a valid column number, or when a requested
# feature is not a column of the neXtProt file.
protein_features <- function() {
  args <- commandArgs(TRUE)
  if (length(args) < 1) {
    args <- c("--help")
  }

  # Help section
  if ("--help" %in% args) {
    cat("Selection and Annotation HPA
    Arguments:
        --inputtype: type of input (list of id or filename)
        --input: input
        --nextprot: path to nextprot information file
        --column: the column number which you would like to apply...
        --header: true/false if your file contains a header
        --type: the type of input IDs (UniProt/EntrezID)
        --argsP1: IsoPoint,SeqLength,MW
        --argsP2: Chr,SubcellLocations
        --argsP3: Diseases
        --output: text output filename \n")
    q(save = "no")
  }

  # Parse arguments
  args <- parse_cli_args(args)
  inputtype <- get_required_arg(args, "inputtype")
  output <- get_required_arg(args, "output")
  typeid <- get_required_arg(args, "type")

  # Collect the protein ids, either typed in directly or taken from a file
  table_content <- NULL
  if (inputtype == "copypaste") {
    input <- strsplit(get_required_arg(args, "input"), " ")[[1]]
  } else if (inputtype == "tabfile") {
    filename <- get_required_arg(args, "input")
    column_arg <- get_required_arg(args, "column")
    # Check the column number ("c3" and "3" are both accepted)
    col_index <- suppressWarnings(as.numeric(gsub("c", "", column_arg)))
    if (is.na(col_index) || col_index %% 1 != 0) {
      stop("Please enter an integer for the column number", call. = FALSE)
    }
    # Get file content
    table_content <- readfile(filename, get_required_arg(args, "header"))
    if (col_index < 1 || col_index > ncol(table_content)) {
      stop("Column ", col_index, " does not exist in the input file",
           call. = FALSE)
    }
    # Extract the protein ids: keep the first id of each ";"-separated cell
    id_cells <- strsplit(as.character(table_content[, col_index]), ";")
    input <- vapply(id_cells, function(parts) parts[1], character(1))
  } else {
    stop("Unknown input type: ", inputtype, call. = FALSE)
  }

  nextprot <- read.table(get_required_arg(args, "nextprot"), header = TRUE,
                         sep = "\t", stringsAsFactors = FALSE, fill = TRUE,
                         na.strings = "")
  if (!("NextprotID" %in% colnames(nextprot))) {
    stop("The neXtProt file has no NextprotID column", call. = FALSE)
  }
  nextprot_ids <- nextprot[["NextprotID"]]

  # Requested features, in the order P1, P2, P3
  features <- c(
    strsplit(get_required_arg(args, "argsP1"), ",")[[1]],
    strsplit(get_required_arg(args, "argsP2"), ",")[[1]],
    strsplit(get_required_arg(args, "argsP3"), ",")[[1]]
  )

  # Change the sample ids if they are uniprot ids to be able to match them
  # with neXtProt data
  if (typeid == "uniprot") {
    input <- sub("^", "NX_", input)
  }

  # Select user input protein ids in neXtProt
  if (!any(input %in% nextprot_ids)) {
    # Report the problem in the output file (never in the input file)
    write.table("None of the input ids can be found in Nextprot",
                file = output, sep = "\t", quote = FALSE, col.names = TRUE,
                row.names = FALSE)
  } else {
    unknown <- setdiff(features, colnames(nextprot))
    if (length(unknown) > 0) {
      stop("Unknown neXtProt feature(s): ", paste(unknown, collapse = ", "),
           call. = FALSE)
    }

    # Get information from neXtProt: one column per feature, one row per id
    # (NA for the ids that are not in neXtProt)
    row_index <- match(input, nextprot_ids)
    feature_columns <- lapply(features, function(feature) {
      nextprot[[feature]][row_index]
    })
    res <- do.call(
      cbind,
      c(list(matrix(nrow = length(input), ncol = 0)), feature_columns)
    )

    # Write output
    if (inputtype == "copypaste") {
      res <- cbind(as.matrix(input), res)
      colnames(res) <- c(typeid, features)
      write.table(res, output, row.names = FALSE, sep = "\t", quote = FALSE)
    } else {
      output_names <- c(names(table_content), features)
      output_content <- cbind(table_content, res)
      colnames(output_content) <- output_names
      write.table(output_content, output, row.names = FALSE, sep = "\t",
                  quote = FALSE)
    }
  }
}
# Entry point: run the tool with the arguments given on the command line.
protein_features()