Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
annotate get_data_HPA_v2.R @ 5:f15cdeeba4b4 draft
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
author | proteore |
---|---|
date | Mon, 19 Mar 2018 10:07:38 -0400 |
parents | cf2fa609625b |
children |
rev | line source |
---|---|
0
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
1 # Usage : |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
2 # Rscript --vanilla get_data_HPA_v2.R --typeinput copypaste --input |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
3 # ENSG00000283071 --header FALSE --proteinatlas proteinatlas.csv --column c1 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
4 # --select RNA.tissue.category,Reliability..IH.,Reliability..IF. --output |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
5 # output.txt |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
6 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
7 # INPUTS : |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
8 # --typeinput : "copypaste" or "tabfile" |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
9 # --input : either a file name (e.g : input.txt) or a list of blank-separated |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
10 # ENSG identifiers (e.g : ENSG00000283071 ENSG00000283072) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
11 # --header : "TRUE" or "FALSE" : indicates in case the input is a file if said |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
12 # file has an header |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
13 # --proteinatlas : HPA proteinatlas tab file |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
14 # --column : column containing in input ENSG identifiers |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
15 # --select : information from HPA to select, may be |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
17 # --output : output file name |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
18 # Useful functions |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
19 |
5
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
20 # Read file and return file content as data.frame |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
21 readfile = function(filename, header) { |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
22 if (header == "true") { |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
23 # Read only first line of the file as header: |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
24 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
25 #Read the data of the files (skipping the first row) |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
26 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
27 # Remove empty rows |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
28 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
29 #And assign the header to the data |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
30 names(file) <- headers |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
31 } |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
32 else { |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
33 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
34 # Remove empty rows |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
35 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
36 } |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
37 return(file) |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
38 } |
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
39 |
0
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
40 '%!in%' <- function(x,y)!('%in%'(x,y)) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
41 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
42 args = commandArgs(trailingOnly = TRUE) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
43 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
44 # create a list of the arguments from the command line, separated by a blank space |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
45 hh <- paste(unlist(args),collapse=' ') |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
46 # delete the first element of the list which is always a blank space |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
47 listoptions <- unlist(strsplit(hh,'--'))[-1] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
48 # for each input, split the arguments with blank space as separator, unlist, and delete the first element which is the input name (e.g --protatlas) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
49 options.args <- sapply(listoptions,function(x){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
50 unlist(strsplit(x, ' '))[-1] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
51 }) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
52 # same as the step above, except that only the names are kept |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
53 options.names <- sapply(listoptions,function(x){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
54 option <- unlist(strsplit(x, ' '))[1] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
55 }) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
56 names(options.args) <- unlist(options.names) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
57 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
58 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
59 typeinput = as.character(options.args[1]) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
60 proteinatlas = read.table(as.character(options.args[4]),header=TRUE,sep="\t",quote="\"",fill=TRUE,blank.lines.skip=TRUE, na.strings=c("NA"," ","")) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
61 listfile = options.args[2] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
62 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
63 header = as.character(options.args[3]) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
64 column = as.numeric(gsub("c","",options.args[5])) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
65 select = as.character(options.args[6]) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
66 output = as.character(options.args[7]) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
67 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
68 if (typeinput=="copypaste"){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
69 sample = as.data.frame(unlist(listfile)) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
70 sample = sample[,column] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
71 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
72 if (typeinput=="tabfile"){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
73 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
74 if (header=="TRUE"){ |
5
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
75 listfile = readfile(listfile, "true") |
0
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
76 }else{ |
5
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
77 listfile = readfile(listfile, "false") |
0
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
78 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
79 sample = listfile[,column] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
80 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
81 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
82 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
83 # Select user input ensembl ids in HPA protein atlas file |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
84 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
85 if ((length(sample[sample %in% proteinatlas[,3]]))==0){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
86 write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
87 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
88 }else{ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
89 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
90 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
91 to_keep = c() |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
92 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
93 if (select!="None"){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
94 select = unlist(strsplit(select,",")) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
95 for (arg in select){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
96 colnb = which(colnames(proteinatlas) %in% c(arg)) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
97 to_keep = c(to_keep,colnb) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
98 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
99 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
100 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
101 to_keep = c(3,to_keep) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
102 lines = which(proteinatlas[,3] %in% sample) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
103 data = proteinatlas[lines,] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
104 data = data[,to_keep] |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
105 # if only some of the proteins were not found in proteinatlas they will be added to |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
106 # the file with the fields "Protein not found in proteinatlas" |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
107 if (length(which(sample %!in% proteinatlas[,3]))!=0){ |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
108 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) |
5
f15cdeeba4b4
planemo upload commit 4af7ac25de19ca10b1654820e909c647a2d337b2-dirty
proteore
parents:
0
diff
changeset
|
109 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) |
0
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
110 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
111 colnames(proteins_not_found)=colnames(data) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
112 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
113 data = rbind(data,proteins_not_found) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
114 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
115 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
116 # Merge original data and data selected from proteinatlas |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
117 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
118 # Before that, if the initial ids were uniprot ids change them back from |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
119 # proteinatlas to uniprot ids in data |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
120 data = merge(listfile, data, by.x = column, by.y=1) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
121 colnames(data)[1] = "Ensembl gene ids" |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
122 # Write result |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
123 write.table(data,file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
124 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
125 } |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
126 |
cf2fa609625b
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff
changeset
|
127 |