Mercurial > repos > proteore > proteore_prot_features
comparison protein_features.R @ 1:bfc679370c64 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author | proteore |
---|---|
date | Fri, 16 Feb 2018 04:06:16 -0500 |
parents | |
children | 867d47ff782c |
comparison
equal
deleted
inserted
replaced
0:e3b52db3d583 | 1:bfc679370c64 |
---|---|
# Load a tab-separated file into a data.frame, discarding rows in which every
# cell is NA or the empty string.
#
# filename: path of the tab-separated file to read.
# header:   the string "true" when the first line of the file holds the column
#           names; any other value means the file has no header line.
#
# Returns the file content as a data.frame. With header == "true" the columns
# are named after the first line, otherwise they keep read.table's defaults.
readfile <- function(filename, header) {
  # Every read of the file uses the same parsing options; only the number of
  # lines to skip / read differs between the calls.
  read_tsv <- function(...) {
    read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE,
               fill = TRUE, na.strings = c("", "NA"), blank.lines.skip = TRUE,
               ...)
  }

  has_header <- header == "true"
  if (has_header) {
    # The header line is read on its own, the data starts on the second line
    headers <- read_tsv(nrows = 1)
    content <- read_tsv(skip = 1)
  } else {
    content <- read_tsv()
  }

  # Drop the rows that contain nothing but NA / empty cells
  is_empty_row <- apply(is.na(content) | content == "", 1, all)
  content <- content[!is_empty_row, , drop = FALSE]

  # Column names are attached once the empty rows are gone
  if (has_header) {
    names(content) <- headers
  }
  return(content)
}
20 | |
# Command-line driver: annotate a list of protein IDs with feature columns
# taken from a neXtProt information table and write the result as a
# tab-separated file.
#
# Takes no R arguments; everything is read from the command line as
# "--key=value" pairs (see the help text printed for --help):
#   --inputtype  "copypaste" (space-separated IDs in --input) or "tabfile"
#                (--input is a tab-separated file, IDs are in --column)
#   --column     ID column for tabfile input, e.g. "c1" or "1"
#   --header     "true" when the tabfile has a header line
#   --nextprot   tab-separated neXtProt table with a header line; it must
#                contain a "NextprotID" column
#   --type       "uniprot" makes the IDs get the "NX_" prefix before matching
#   --argsP1/--argsP2/--argsP3  comma-separated names of neXtProt columns to
#                report (may be empty)
#   --output     path of the file to write
#
# Output: for copypaste input, one row per ID (ID column + requested
# features); for tabfile input, the input table with the requested features
# appended. IDs missing from the neXtProt table get NA features. When no ID
# at all is found, a one-line message is written to --output instead.
# Stops with an error on missing/invalid arguments.
protein_features <- function() {
  args <- commandArgs(TRUE)
  if (length(args) < 1) {
    args <- c("--help")
  }

  # Help section
  if ("--help" %in% args) {
    cat("Selection and Annotation HPA
    Arguments:
        --inputtype: type of input (list of id or filename)
        --input: input
        --nextprot: path to nextprot information file
        --column: the column number which you would like to apply...
        --header: true/false if your file contains a header
        --type: the type of input IDs (UniProt/EntrezID)
        --argsP1: IsoPoint,SeqLength,MW
        --argsP2: Chr,SubcellLocations
        --argsP3: Diseases
        --output: text output filename \n")
    q(save = "no")
  }

  # Parse arguments. Each "--key=value" is split on its FIRST "=" only, so
  # values may themselves contain "=", and "--key=" yields an empty value
  # instead of a value recycled from the key name.
  stripped <- sub("^--", "", args)
  has_value <- grepl("=", stripped, fixed = TRUE)
  args <- as.list(ifelse(has_value, sub("^[^=]*=", "", stripped), ""))
  names(args) <- sub("=.*$", "", stripped)

  required <- c("inputtype", "input", "nextprot", "type", "output")
  missing_args <- setdiff(required, names(args))
  if (length(missing_args) > 0) {
    stop("Missing required argument(s): ",
         paste0("--", missing_args, collapse = ", "), call. = FALSE)
  }

  # "[[" is used throughout instead of "$" to avoid partial name matching
  # (e.g. between "input" and "inputtype").
  inputtype <- args[["inputtype"]]
  output <- args[["output"]]

  if (inputtype == "copypaste") {
    input <- strsplit(args[["input"]], " ")[[1]]
  } else if (inputtype == "tabfile") {
    filename <- args[["input"]]
    # The column is given either as a bare number or prefixed with "c"
    id_column <- suppressWarnings(as.numeric(gsub("c", "", args[["column"]])))
    if (length(id_column) != 1 || is.na(id_column) ||
        id_column %% 1 != 0 || id_column < 1) {
      stop("Please enter a valid column number (e.g. c1) for --column",
           call. = FALSE)
    }
    # Get file content
    file <- readfile(filename, args[["header"]])
    if (id_column > ncol(file)) {
      stop("--column is larger than the number of columns of the input file",
           call. = FALSE)
    }
    # Extract the protein IDs: when a cell lists several IDs separated by
    # ";", only the first one is kept
    id_cells <- strsplit(as.character(file[, id_column]), ";")
    input <- vapply(id_cells, function(ids) ids[1], character(1))
  } else {
    stop("Unknown --inputtype '", inputtype,
         "' (expected 'copypaste' or 'tabfile')", call. = FALSE)
  }

  nextprot <- read.table(args[["nextprot"]], header = TRUE, sep = "\t",
                         stringsAsFactors = FALSE, fill = TRUE,
                         na.strings = "")
  if (!"NextprotID" %in% names(nextprot)) {
    stop("The neXtProt file has no 'NextprotID' column", call. = FALSE)
  }
  typeid <- args[["type"]]

  # Requested feature columns; an absent or empty --argsPx selects nothing
  split_features <- function(value) {
    if (is.null(value)) character(0) else strsplit(value, ",")[[1]]
  }
  features <- c(split_features(args[["argsP1"]]),
                split_features(args[["argsP2"]]),
                split_features(args[["argsP3"]]))
  unknown <- setdiff(features, names(nextprot))
  if (length(unknown) > 0) {
    stop("Unknown column(s) requested from the neXtProt file: ",
         paste(unknown, collapse = ", "), call. = FALSE)
  }

  # Change the sample ids if they are uniprot ids to be able to match them
  # with Nextprot data
  if (typeid == "uniprot") {
    input <- gsub("^", "NX_", input)
  }

  # Select user input protein ids in nextprot.
  # NOTE(review): the presence check looks at the first column of the table
  # while the lookup below uses "NextprotID" - presumably the same column;
  # confirm against the neXtProt file layout.
  if (!any(input %in% nextprot[, 1])) {
    # The message goes to the OUTPUT file (it used to be written to the
    # input file name, overwriting the user's data or failing outright).
    write.table("None of the input ids can be found in Nextprot",
                file = output, sep = "\t", quote = FALSE, col.names = TRUE,
                row.names = FALSE)
  } else {
    # Get information from neXtProt: the row of each input ID is looked up
    # once, then every requested column is taken from those rows
    # (NA for IDs that are not in the table).
    hits <- match(input, nextprot[["NextprotID"]])
    feature_columns <- lapply(features,
                              function(feature) nextprot[hits, feature])
    if (length(feature_columns) > 0) {
      res <- do.call(cbind, feature_columns)
    } else {
      res <- matrix(nrow = length(input), ncol = 0)
    }

    # Write output
    if (inputtype == "copypaste") {
      res <- cbind(as.matrix(input), res)
      colnames(res) <- c(typeid, features)
      write.table(res, output, row.names = FALSE, sep = "\t", quote = FALSE)
    } else {
      output_content <- cbind(file, res)
      colnames(output_content) <- c(names(file), features)
      write.table(output_content, output, row.names = FALSE, sep = "\t",
                  quote = FALSE)
    }
  }

  # Nothing useful to return; keep the top-level call from printing a value
  invisible(NULL)
}
# Script entry point: run the tool with the arguments given on the command line
protein_features()