annotate get_data_HPA_v2.R @ 5:f15cdeeba4b4 draft

planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
author proteore
date Mon, 19 Mar 2018 10:07:38 -0400
parents cf2fa609625b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
1 # Usage :
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
2 # Rscript --vanilla get_data_HPA_v2.R --typeinput copypaste --input
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
3 # ENSG00000283071 --header FALSE --proteinatlas proteinatlas.csv --column c1
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
4 # --select RNA.tissue.category,Reliability..IH.,Reliability..IF. --output
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
5 # output.txt
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
6
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
7 # INPUTS :
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
8 # --typeinput : "copypaste" or "tabfile"
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
9 # --input : either a file name (e.g : input.txt) or a list of blank-separated
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
10 # ENSG identifiers (e.g : ENSG00000283071 ENSG00000283072)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
11 # --header : "TRUE" or "FALSE" : indicates in case the input is a file if said
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
12 # file has an header
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
13 # --proteinatlas : HPA proteinatlas tab file
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
14 # --column : column containing in input ENSG identifiers
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
15 # --select : information from HPA to select, may be
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
17 # --output : output file name
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
18 # Useful functions
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
19
5
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
20 # Read file and return file content as data.frame
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
21 readfile = function(filename, header) {
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
22 if (header == "true") {
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
23 # Read only first line of the file as header:
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
24 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
25 #Read the data of the files (skipping the first row)
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
26 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
27 # Remove empty rows
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
28 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
29 #And assign the header to the data
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
30 names(file) <- headers
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
31 }
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
32 else {
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
33 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
34 # Remove empty rows
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
35 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
36 }
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
37 return(file)
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
38 }
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
39
0
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
40 '%!in%' <- function(x,y)!('%in%'(x,y))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
41
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
42 args = commandArgs(trailingOnly = TRUE)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
43
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
44 # create a list of the arguments from the command line, separated by a blank space
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
45 hh <- paste(unlist(args),collapse=' ')
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
46 # delete the first element of the list which is always a blank space
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
47 listoptions <- unlist(strsplit(hh,'--'))[-1]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
48 # for each input, split the arguments with blank space as separator, unlist, and delete the first element which is the input name (e.g --protatlas)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
49 options.args <- sapply(listoptions,function(x){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
50 unlist(strsplit(x, ' '))[-1]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
51 })
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
52 # same as the step above, except that only the names are kept
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
53 options.names <- sapply(listoptions,function(x){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
54 option <- unlist(strsplit(x, ' '))[1]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
55 })
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
56 names(options.args) <- unlist(options.names)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
57
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
58
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
59 typeinput = as.character(options.args[1])
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
60 proteinatlas = read.table(as.character(options.args[4]),header=TRUE,sep="\t",quote="\"",fill=TRUE,blank.lines.skip=TRUE, na.strings=c("NA"," ",""))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
61 listfile = options.args[2]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
62
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
63 header = as.character(options.args[3])
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
64 column = as.numeric(gsub("c","",options.args[5]))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
65 select = as.character(options.args[6])
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
66 output = as.character(options.args[7])
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
67
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
68 if (typeinput=="copypaste"){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
69 sample = as.data.frame(unlist(listfile))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
70 sample = sample[,column]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
71 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
72 if (typeinput=="tabfile"){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
73
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
74 if (header=="TRUE"){
5
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
75 listfile = readfile(listfile, "true")
0
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
76 }else{
5
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
77 listfile = readfile(listfile, "false")
0
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
78 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
79 sample = listfile[,column]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
80
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
81 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
82
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
83 # Select user input ensembl ids in HPA protein atlas file
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
84
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
85 if ((length(sample[sample %in% proteinatlas[,3]]))==0){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
86 write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
87
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
88 }else{
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
89
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
90
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
91 to_keep = c()
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
92
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
93 if (select!="None"){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
94 select = unlist(strsplit(select,","))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
95 for (arg in select){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
96 colnb = which(colnames(proteinatlas) %in% c(arg))
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
97 to_keep = c(to_keep,colnb)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
98 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
99 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
100
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
101 to_keep = c(3,to_keep)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
102 lines = which(proteinatlas[,3] %in% sample)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
103 data = proteinatlas[lines,]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
104 data = data[,to_keep]
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
105 # if only some of the proteins were not found in proteinatlas they will be added to
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
106 # the file with the fields "Protein not found in proteinatlas"
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
107 if (length(which(sample %!in% proteinatlas[,3]))!=0){
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
108 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])])
5
f15cdeeba4b4 planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents: 0
diff changeset
109 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1))
0
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
110
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
111 colnames(proteins_not_found)=colnames(data)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
112
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
113 data = rbind(data,proteins_not_found)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
114 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
115
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
116 # Merge original data and data selected from proteinatlas
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
117
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
118 # Before that, if the initial ids were uniprot ids change them back from
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
119 # proteinatlas to uniprot ids in data
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
120 data = merge(listfile, data, by.x = column, by.y=1)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
121 colnames(data)[1] = "Ensembl gene ids"
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
122 # Write result
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
123 write.table(data,file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE)
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
124
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
125 }
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
126
cf2fa609625b planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
127