Mercurial > repos > proteore > proteore_expression_rnaseq_abbased
comparison get_data_HPA_v2.R @ 0:cf2fa609625b draft
planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author | proteore |
---|---|
date | Sun, 26 Nov 2017 20:49:17 -0500 |
parents | |
children | f15cdeeba4b4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cf2fa609625b |
---|---|
1 # Usage : | |
2 # Rscript --vanilla get_data_HPA_v2.R --typeinput copypaste --input | |
3 # ENSG00000283071 --header FALSE --proteinatlas proteinatlas.csv --column c1 | |
4 # --select RNA.tissue.category,Reliability..IH.,Reliability..IF. --output | |
5 # output.txt | |
6 | |
7 # INPUTS : | |
8 # --typeinput : "copypaste" or "tabfile" | |
9 # --input : either a file name (e.g : input.txt) or a list of blank-separated | |
10 # ENSG identifiers (e.g : ENSG00000283071 ENSG00000283072) | |
11 # --header : "TRUE" or "FALSE" : indicates in case the input is a file if said | |
12 # file has an header | |
13 # --proteinatlas : HPA proteinatlas tab file | |
14 # --column : column containing in input ENSG identifiers | |
15 # --select : information from HPA to select, may be | |
16 # : RNA.tissue.category,Reliability..IH.,Reliability..IF. (comma-separated) | |
17 # --output : output file name | |
18 # Useful functions | |
19 | |
20 '%!in%' <- function(x,y)!('%in%'(x,y)) | |
21 | |
22 args = commandArgs(trailingOnly = TRUE) | |
23 | |
24 # create a list of the arguments from the command line, separated by a blank space | |
25 hh <- paste(unlist(args),collapse=' ') | |
26 # delete the first element of the list which is always a blank space | |
27 listoptions <- unlist(strsplit(hh,'--'))[-1] | |
28 # for each input, split the arguments with blank space as separator, unlist, and delete the first element which is the input name (e.g --protatlas) | |
29 options.args <- sapply(listoptions,function(x){ | |
30 unlist(strsplit(x, ' '))[-1] | |
31 }) | |
32 # same as the step above, except that only the names are kept | |
33 options.names <- sapply(listoptions,function(x){ | |
34 option <- unlist(strsplit(x, ' '))[1] | |
35 }) | |
36 names(options.args) <- unlist(options.names) | |
37 | |
38 | |
39 typeinput = as.character(options.args[1]) | |
40 proteinatlas = read.table(as.character(options.args[4]),header=TRUE,sep="\t",quote="\"",fill=TRUE,blank.lines.skip=TRUE, na.strings=c("NA"," ","")) | |
41 listfile = options.args[2] | |
42 | |
43 header = as.character(options.args[3]) | |
44 column = as.numeric(gsub("c","",options.args[5])) | |
45 select = as.character(options.args[6]) | |
46 output = as.character(options.args[7]) | |
47 | |
48 if (typeinput=="copypaste"){ | |
49 sample = as.data.frame(unlist(listfile)) | |
50 sample = sample[,column] | |
51 } | |
52 if (typeinput=="tabfile"){ | |
53 | |
54 if (header=="TRUE"){ | |
55 listfile = read.table(listfile,header=TRUE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) | |
56 }else{ | |
57 listfile = read.table(listfile,header=FALSE,sep="\t",quote="\"",fill=TRUE, na.strings=c("","NA")) | |
58 } | |
59 sample = listfile[,column] | |
60 | |
61 } | |
62 | |
63 # Select user input ensembl ids in HPA protein atlas file | |
64 | |
65 if ((length(sample[sample %in% proteinatlas[,3]]))==0){ | |
66 write.table("None of the input ENSG ids are can be found in HPA data file",file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) | |
67 | |
68 }else{ | |
69 | |
70 | |
71 to_keep = c() | |
72 | |
73 if (select!="None"){ | |
74 select = unlist(strsplit(select,",")) | |
75 for (arg in select){ | |
76 colnb = which(colnames(proteinatlas) %in% c(arg)) | |
77 to_keep = c(to_keep,colnb) | |
78 } | |
79 } | |
80 | |
81 to_keep = c(3,to_keep) | |
82 lines = which(proteinatlas[,3] %in% sample) | |
83 data = proteinatlas[lines,] | |
84 data = data[,to_keep] | |
85 # if only some of the proteins were not found in proteinatlas they will be added to | |
86 # the file with the fields "Protein not found in proteinatlas" | |
87 if (length(which(sample %!in% proteinatlas[,3]))!=0){ | |
88 proteins_not_found = as.data.frame(sample[which(sample %!in% proteinatlas[,3])]) | |
89 proteins_not_found = cbind(proteins_not_found,matrix(rep("Protein not found in HPA",length(proteins_not_found)),nrow=length(proteins_not_found),ncol=length(colnames(data))-1)) | |
90 | |
91 colnames(proteins_not_found)=colnames(data) | |
92 | |
93 data = rbind(data,proteins_not_found) | |
94 } | |
95 | |
96 # Merge original data and data selected from proteinatlas | |
97 | |
98 # Before that, if the initial ids were uniprot ids change them back from | |
99 # proteinatlas to uniprot ids in data | |
100 data = merge(listfile, data, by.x = column, by.y=1) | |
101 colnames(data)[1] = "Ensembl gene ids" | |
102 # Write result | |
103 write.table(data,file=output,sep="\t",quote=FALSE,col.names=TRUE,row.names=FALSE) | |
104 | |
105 } | |
106 | |
107 |