5
|
1 NAMEOFOUTPUTFILE<-"output1.csv"
|
|
2
|
|
3 SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE)
|
|
4 SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
|
|
5 SBF<-t(SBF)
|
|
6 PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE)
|
|
7
|
|
8
|
|
9 YsToim<-rep("xY",times=nrow(PositiveMotifs))
|
|
10 PositiveMotifs[,11]<-YsToim
|
|
11
|
|
12
|
|
13
|
|
14 #this code is meant to take a list of proteins, and our list of phosphopeptides, and find which Y-containing peptides could have been found phosphorylated but weren't
|
|
15
|
|
16
|
|
17
|
|
18 #first then I create the list of phosphopeptides
|
|
19 Positive9Letters<-PositiveMotifs[,4:18]
|
|
20 PositiveTrueMotifs<-c()
|
|
21
|
|
22 #then I take the proteins
|
|
23 AccessionNumbers<-as.character(SBF[2:nrow(SBF),1])
|
|
24 AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)]
|
|
25 #the above is only those proteins from which our phosphopeptides sprung, the below is every protein in the human proteome
|
|
26 ALLPOSSIBLE<-SuperAwesometrial[,1]
|
|
27 ALLPOSSIBLE<-as.character(ALLPOSSIBLE)
|
|
28
|
|
29 for (q in 1:nrow(Positive9Letters)) {
|
|
30 LeftJust<-0
|
|
31 RightJust<-0
|
|
32
|
|
33
|
|
34 motifmotif<-Positive9Letters[q,]
|
|
35 motifmotif<-paste(motifmotif, collapse = "",sep = "")
|
|
36 motifmotif<-unlist(strsplit(motifmotif, split = ""))
|
|
37 position <- match(x = "x", table = motifmotif)
|
|
38 LeftJust<-position-1
|
|
39 RightJust<-length(motifmotif)-position-1
|
|
40 #find which position was the phospho-amino acid, it is marked with an X
|
|
41
|
|
42 LeftSpaces<-rep(x=" ", times=(7-LeftJust))
|
|
43 RightSpaces<-rep(x=" ", times=(7-RightJust))
|
|
44 motifmotif<-motifmotif[!motifmotif %in% c("x")]
|
|
45 motifmotif<-c(LeftSpaces,motifmotif,RightSpaces)
|
|
46 motifmotif<-paste(motifmotif, collapse = "",sep = "")
|
|
47 #put spaces on either side of the motif if the motif does not fill out a -7 to +7 motif
|
|
48
|
|
49 PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif)
|
|
50 }
|
|
51
|
|
52
|
|
53
|
|
54 allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1)
|
|
55 thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1)
|
|
56 #I preallocate vectors for efficiency, but I have no way of knowing how big these particular vectors need to be, so I just make them way bigger
|
|
57 #than I know they need to be. A vector 1 million long is plenty big
|
|
58
|
|
59 MotifNumber<-2
|
|
60
|
|
61 locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE))
|
|
62
|
|
63
|
|
64 if (sum(locations)>0){
|
|
65 whereisit<-locations
|
|
66 for (u in 1:length(whereisit)) {
|
|
67 i<-whereisit[u]
|
|
68 name<-c()
|
|
69 data<-c()
|
|
70 name<-as.character(SuperAwesometrial[i,1])
|
|
71 #the name of each protein is the first column
|
|
72 name<-sub(x=name, pattern=",", replacement="")
|
|
73 #the names may contain commas, remove them
|
|
74 data<-as.character(SuperAwesometrial[i,3])
|
|
75 #the amino acids are stored in the third column
|
|
76 data<-strsplit(data,"")
|
|
77 #split them into their component letters
|
|
78 data<-unlist(data)
|
|
79 #turn them into a vector
|
|
80 motif<-c()
|
|
81
|
|
82 #this part below is where I can speed things up
|
|
83 The_Ys<-data=="Y"
|
|
84 #find any Y in the protein
|
|
85 if (sum(The_Ys>0)){ #if there is at least one Y
|
|
86 Where_are_they<-which(The_Ys %in% TRUE)
|
|
87 for (z in 1:length(Where_are_they)) { #then for every Y, make a motif
|
|
88
|
|
89 j<-Where_are_they[z]
|
|
90 a <- j-7
|
|
91 a<-ifelse(a<1, a <- 1, a <- a)
|
|
92 b<-j+7
|
|
93 b<-ifelse(b>length(data), b <- length(data), b <-
|
|
94 b)
|
|
95 #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein
|
|
96
|
|
97 LeftSide<-7-(j-a)
|
|
98 RightSide<-7-(b-j)
|
|
99 #how is the motif justified? Does it have exactly 4 letters to the left/right, or does it not?
|
|
100
|
|
101 leftspaces<-rep(" ",times=LeftSide)
|
|
102 rightspaces<-rep(" ",times=RightSide)
|
|
103 #add blank spaces if the motif has less than 4 letters to the left/right
|
|
104
|
|
105
|
|
106 motif<-(data[(a):(b)])
|
|
107 motif<-c(leftspaces,motif,rightspaces)
|
|
108 #save that motif, which is the Y and +/- 4 amino acids, including truncation
|
|
109
|
|
110 motif<-paste(motif, sep="", collapse="")
|
|
111 #the 4 amino acids, put them back together into a single string
|
|
112 motif<-matrix(data=c(motif),nrow = 1)
|
|
113 namesss<-matrix(data=c(name),nrow = 1)
|
|
114 #keep this motif and separately keep the name of the protein it came from
|
|
115
|
|
116 allmotifs[MotifNumber,1]<-motif
|
|
117 thenames[MotifNumber,1]<-namesss
|
|
118 MotifNumber<-MotifNumber+1
|
|
119
|
|
120 }
|
|
121
|
|
122 }
|
|
123 }
|
|
124 }
|
|
125
|
|
126
|
|
127
|
|
128
|
|
129 names(allmotifs)<-thenames
|
|
130
|
|
131 truemotifs<-allmotifs[!duplicated(allmotifs)]
|
|
132 #remove duplicates from the motifs and names
|
|
133
|
|
134 #make the motifs and names into matrices
|
|
135 truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs]
|
|
136 outputfile<-cbind(names(truemotifs),truemotifs)
|
|
137 outputfile <- gsub(",","",outputfile)
|
|
138 write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",",
|
|
139 row.names=FALSE,col.names = FALSE, na="", append=TRUE)
|