annotate get_data_nextprot.R @ 0:e3b52db3d583 draft

planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author proteore
date Sun, 26 Nov 2017 19:45:52 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
1 # Usage : Rscript --vanilla get_data_nextprot.R --inputtype copypaste (or
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
2 # tabfile) --input file.txt --nextprot result_nextprot.txt --column column
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
3 # --argsP1 IsoPoint,SeqLength,MW
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
4 # --argsP2 Chr,SubcellLocations --argsP3 Diseases --typeid nextprot (uniprot)
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
5 # --output output.txt --header TRUE
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
6
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
7 # e.g :
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
8 # Rscript --vanilla get_data_nextprot.R --inputtype copypaste --input P01133 P00533 P62158 Q16566 P31323 P17612 P10644
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
9 # P22612 P31321 P13861 P22694 P25098 P16220 Q14573 Q14571 Q14643 Q05655 Q02156
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
10 # P19174 O43865 Q01064 P54750 Q14123 P51828 Q08828 O60266 Q08462 O60503 O43306
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
11 # Q8NFM4 O95622 P40145 P17252 P05129 --nextprot
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
12 # result_nextprot.txt --column c1 --argsP1 IsoPoint --argsP2
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
13 # Chr --argsP3 Diseases --typeid uniprot --output output.txt --header FALSE
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
14
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
15 # Useful functions
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
16
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
# Negated set membership: for each element of x, TRUE when it does not occur
# in y (the logical complement of x %in% y).
`%!in%` <- function(x, y) {
  !(x %in% y)
}
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
18
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
# Parse arguments

# Raw command-line tokens that follow the script name.
args <- commandArgs(trailingOnly = TRUE)

# Re-join the tokens and cut the command line at every "--": each piece is one
# option name followed by its values. The first piece is the (empty) text that
# precedes the first "--", so it is dropped.
listoptions <- unlist(strsplit(paste(unlist(args), collapse = " "), "--"))[-1]

# One list entry per option, holding its space-separated values and named after
# the option (without the leading "--"). lapply()/vapply() always give a list /
# character vector; sapply() would return a plain vector or a matrix whenever
# every option happens to have the same number of values.
options.args <- lapply(listoptions, function(x) unlist(strsplit(x, " "))[-1])
names(options.args) <- vapply(
  listoptions,
  function(x) unlist(strsplit(x, " "))[1],
  character(1),
  USE.NAMES = FALSE
)

# Option names in the order the script historically expected them.
known_options <- c(
  "inputtype", "input", "nextprot", "column", "argsP1",
  "argsP2", "argsP3", "typeid", "output", "header"
)

# Return the values given for option `name` as a character vector.
# The option is looked up by name first. If it is absent, the option found at
# its historical position is used instead, but only when that option carries an
# unrecognised name (so a correctly named option is never stolen by another).
# When nothing is found, `default` is returned, or an error is raised if no
# default was supplied.
option_value <- function(name, default = NULL) {
  values <- character(0)
  position <- match(name, known_options)
  if (name %in% names(options.args)) {
    values <- options.args[[name]]
  } else if (position <= length(options.args) &&
               !(names(options.args)[position] %in% known_options)) {
    values <- options.args[[position]]
  }
  # empty strings come from repeated blanks on the command line
  values <- values[nzchar(values)]
  if (length(values) > 0) {
    return(values)
  }
  if (is.null(default)) {
    stop("Missing value for required option --", name, call. = FALSE)
  }
  default
}

# Same as option_value() but collapsed to a single string. Joining with a
# blank restores a value (e.g. a file path) that contained single spaces and
# was cut apart by the space-based splitting above.
option_string <- function(name, default = NULL) {
  paste(option_value(name, default), collapse = " ")
}

typeinput <- option_string("inputtype")

# In copypaste mode --input is the list of ids itself, so it is kept as a
# character vector (as.character() on the list element would deparse it into
# the single string 'c("id1", "id2", ...)'); otherwise it is a file path.
listfile <- option_value("input")
if (typeinput != "copypaste") {
  listfile <- paste(listfile, collapse = " ")
}

nextprot <- read.table(
  option_string("nextprot"),
  header = TRUE, sep = "\t", quote = "\""
)
# The column is given as "c<number>", e.g. "c1".
column <- as.numeric(gsub("c", "", option_string("column")))
P1_args <- option_string("argsP1", "None")
P2_args <- option_string("argsP2", "None")
P3_args <- option_string("argsP3", "None")
typeid <- option_string("typeid")
filename <- option_string("output")
header <- option_string("header", "FALSE")
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
48
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
# Build `sample`, the protein ids to look up, from the user input.
if (typeinput == "tabfile") {
  # Ids are read from column `column` of a tab-separated file; `header` is the
  # string "TRUE" when the first line of that file holds column names.
  has_header <- if (header == "TRUE") TRUE else FALSE
  listfile <- read.table(
    listfile,
    header = has_header, sep = "\t", quote = "\"", fill = TRUE
  )
  sample <- listfile[, column]
} else if (typeinput == "copypaste") {
  # Ids were given directly on the command line.
  sample <- as.data.frame(unlist(listfile))[, column]
}

# Change the sample ids if they are uniprot ids to be able to match them with
# Nextprot data: the ids are prefixed with "NX_".
if (typeid == "uniprot") {
  sample <- sub("^", "NX_", sample)
}
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
68
e3b52db3d583 planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
proteore
parents:
diff changeset
# Select user input protein ids in nextprot

# TRUE for each input id present in the first column of the nextprot table.
found <- sample %in% nextprot[, 1]

if (!any(found)) {
  # Nothing matched: the output file only contains this message.
  write.table(
    "None of the input ids are can be found in Nextprot",
    file = filename, sep = "\t", quote = FALSE,
    col.names = TRUE, row.names = FALSE
  )
} else {
  # Names of the nextprot columns requested through the three option groups;
  # a group set to "None" requests nothing, otherwise it is a comma-separated
  # list of column names.
  requested <- unlist(lapply(
    list(P1_args, P2_args, P3_args),
    function(fields) {
      if (fields == "None") character(0) else unlist(strsplit(fields, ","))
    }
  ))

  # Column 1 (the id) is always kept, followed by the requested columns in the
  # order they were asked for; unknown names are ignored and a column asked
  # for twice is kept once.
  requested_cols <- match(requested, colnames(nextprot))
  to_keep <- unique(c(1, requested_cols[!is.na(requested_cols)]))

  # drop = FALSE keeps a data.frame even when only the id column is selected.
  data <- nextprot[nextprot[, 1] %in% sample, to_keep, drop = FALSE]

  # if only some of the proteins were not found in nextprot they will be added
  # to the file with the fields "Protein not found in Nextprot"
  missing_ids <- as.character(sample[!found])
  if (length(missing_ids) > 0) {
    # One row per missing id, one filler cell per annotation column.
    not_found <- cbind(
      missing_ids,
      matrix(
        "Protein not found in Nextprot",
        nrow = length(missing_ids), ncol = ncol(data) - 1
      )
    )
    proteins_not_found <- as.data.frame(not_found, stringsAsFactors = FALSE)
    colnames(proteins_not_found) <- colnames(data)
    data <- rbind(data, proteins_not_found)
  }

  # Merge original data and data selected from nextprot

  # Before that, if the initial ids were uniprot ids change them back from
  # Nextprot to uniprot ids in data
  if (typeid == "uniprot") {
    data[, 1] <- sub("^NX_", "", data[, 1])
  }
  data <- merge(listfile, data, by.x = column, by.y = 1)

  # The merge key ends up in the first column; label it after the id type.
  id_label <- switch(typeid,
    uniprot = "UniprotID",
    nextprot = "NextprotID",
    NULL
  )
  if (!is.null(id_label)) {
    colnames(data)[1] <- id_label
  }

  # Write result
  write.table(
    data,
    file = filename, sep = "\t", quote = FALSE,
    col.names = TRUE, row.names = FALSE
  )
}