0
|
# Name of the CSV file the finished motif table is written to at the end of
# the script.
NAMEOFOUTPUTFILE <- "output1.csv"

# Protein table, tab-delimited, without a header row. Further down the script
# column 1 is used as the protein name and column 3 as the amino-acid sequence.
# Once the companion script has converted the FASTA file, paste the path and
# name of the converted file between the quote marks.
# NOTE(review): the original note does not say which of the two reads below it
# refers to; the sequence table is the likely one - confirm.
SuperAwesometrial <- read.delim2(file = "input1.tabular", header = FALSE)

# Accession-number table, read without a header and transposed straight away,
# so SBF is a matrix whose first column is the first row of the CSV file.
SBF <- t(
  read.csv(file = "input3.csv", header = FALSE, stringsAsFactors = FALSE)
)
|
0
|
9
|
|
# Known positive motifs, one amino-acid letter per cell. Positions without a
# letter must be blank cells in the file, not spaces.
#
# Every column is read as character on purpose. With read.csv's default type
# conversion, a column that happens to contain only "T" and/or "F" is turned
# into logical TRUE/FALSE, and a column that is blank in every row is turned
# into logical NA. Either would later be pasted into the motifs as the text
# "TRUE", "FALSE" or "NA". Read as character, blank cells stay "".
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Overwrite column 11 with the marker "xY" in every row. The "x" is searched
# for later to locate the central Y when the motifs are padded to a fixed
# width.
YsToim <- rep("xY", times = nrow(PositiveMotifs))
PositiveMotifs[, 11] <- YsToim
|
|
16
|
|
17
|
|
18
|
0
|
19 ################################################################################################################################
|
|
# Columns 4 to 18 of the positive set hold the motif letters (15 columns, with
# the "xY" marker column in the middle). Each row is later pasted together,
# split into single characters, searched for the "x" and pasted again.
Positive9Letters <- PositiveMotifs[, 4:18]

# Filled with the padded positive motifs further down.
PositiveTrueMotifs <- c()

# Accession numbers sit in the first column of SBF, below its first entry.
# Negative indexing is used instead of 2:nrow(SBF), because that sequence
# counts backwards when SBF has a single row. Empty strings are dropped along
# with NAs: an empty accession used as a fixed search pattern matches every
# protein name.
AccessionNumbers <- as.character(SBF[-1, 1])
AccessionNumbers <- AccessionNumbers[
  !is.na(AccessionNumbers) & nzchar(AccessionNumbers)
]

# Protein names (first column of the protein table) that the accession numbers
# are searched in.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
|
|
29 ################################################################################################################################
|
|
30
|
|
# Re-centre every known positive motif on its central tyrosine and pad it with
# spaces to a fixed width of 15 characters (7 positions on each side of the Y).
#
# For each row of Positive9Letters:
#   1. paste the cells into one string and split it into single characters;
#   2. find the first "x" (the marker placed in front of the central Y);
#   3. count the letters to the left of the marker and to the right of the Y;
#   4. drop every "x" and add spaces on both sides up to 7 positions per side.
#
# The results are appended to PositiveTrueMotifs in row order.
# seq_len() keeps the code from running on a table without rows, and vapply()
# builds the result in one go instead of growing a vector inside a loop.
PositiveTrueMotifs <- c(
  PositiveTrueMotifs,
  vapply(
    seq_len(nrow(Positive9Letters)),
    function(q) {
      flank <- 7

      cells <- vapply(Positive9Letters[q, ], as.character, character(1))
      # A missing cell means "no letter here"; without this it would be
      # pasted into the motif as the two letters "NA".
      cells[is.na(cells)] <- ""

      motif_chars <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

      marker <- match(x = "x", table = motif_chars)
      left_count <- marker - 1
      # Letters after the marker, minus the central Y itself.
      right_count <- length(motif_chars) - marker - 1

      if (is.na(marker) || left_count > flank || right_count > flank) {
        stop(
          "Positive motif in row ", q, " has no 'x' marker or more than ",
          flank, " letters on one side of the central Y",
          call. = FALSE
        )
      }

      paste(
        c(
          rep(" ", times = flank - left_count),
          motif_chars[!motif_chars %in% "x"],
          rep(" ", times = flank - right_count)
        ),
        collapse = ""
      )
    },
    character(1)
  )
)
|
|
53
|
|
54
|
|
55 ################################################################################################################################
|
|
# Seed the two parallel one-column accumulators with their header cells.
# Motifs and the protein names they came from are appended to them row by row.
thenames <- matrix("AccessionNumbers", nrow = 1)
allmotifs <- matrix("Motifs", nrow = 1)
|
|
58 ################################################################################################################################
|
|
59
|
|
60 ################################################################################################################################
|
|
61
|
|
62 #TrueMotifNums<-which(ALLPOSSIBLE %in% AccessionNumbers)
|
|
63 #fihlodeANs<-c()
|
|
# Return one fixed-width motif for every tyrosine ("Y") in `sequence`.
#
# sequence: a single amino-acid string.
# flank:    number of positions kept on each side of the Y (default 7).
#
# Each motif is the Y plus up to `flank` residues on either side. Where the
# window would run past either end of the sequence it is cut off and the
# missing positions are filled with spaces, so every motif is 2 * flank + 1
# characters wide. Returns character(0) when the sequence has no Y (or is NA).
extract_tyrosine_motifs <- function(sequence, flank = 7) {
  residues <- unlist(strsplit(sequence, ""))
  n_residues <- length(residues)

  vapply(
    which(residues == "Y"),
    function(j) {
      # Clamp the window so it never leaves the sequence.
      window_start <- max(j - flank, 1)
      window_end <- min(j + flank, n_residues)
      paste(
        c(
          rep(" ", times = flank - (j - window_start)),
          residues[window_start:window_end],
          rep(" ", times = flank - (window_end - j))
        ),
        collapse = ""
      )
    },
    character(1)
  )
}

# Rows of the protein table whose name contains an accession number, in
# accession order and then row order. A row matched by several accession
# numbers appears several times. grepl() is vectorised over the names, and
# lapply() over the accession numbers copes with an empty accession list.
matched_rows <- unlist(lapply(
  AccessionNumbers,
  function(accession) {
    which(grepl(as.character(accession), ALLPOSSIBLE, fixed = TRUE))
  }
))

# For every matched protein collect its tyrosine motifs together with the
# protein name (first column, all commas removed) repeated once per motif.
# The amino acids are stored in the third column.
motifs_per_protein <- lapply(matched_rows, function(i) {
  protein_name <- gsub(
    ",", "", as.character(SuperAwesometrial[i, 1]),
    fixed = TRUE
  )
  motifs <- extract_tyrosine_motifs(as.character(SuperAwesometrial[i, 3]))
  list(motifs = motifs, names = rep(protein_name, times = length(motifs)))
})

# Append everything in one step instead of calling rbind() once per motif.
# as.character() turns the NULL that unlist() gives for "nothing found" into
# an empty character vector, so the matrices below simply have zero rows.
allmotifs <- rbind(
  allmotifs,
  matrix(
    as.character(unlist(lapply(motifs_per_protein, `[[`, "motifs"))),
    ncol = 1
  )
)
thenames <- rbind(
  thenames,
  matrix(
    as.character(unlist(lapply(motifs_per_protein, `[[`, "names"))),
    ncol = 1
  )
)
|
|
134
|
|
135
|
|
136
|
|
137
|
|
138 ################################################################################################################################
|
|
139 ################################################################################################################################
|
|
140 ################################################################################################################################
|
|
141
|
|
142
|
|
143 # for (i in 1:nrow(SuperAwesometrial)){
|
|
144 #
|
|
145 # }
|
|
146
|
|
# Attach the protein name to every motif so the name travels with the motif
# through the filtering below.
names(allmotifs) <- thenames

# Keep only the first occurrence of each motif. The names are not filtered
# separately; each surviving motif keeps the name attached to it above.
first_occurrence <- !duplicated(allmotifs)
truemotifs <- allmotifs[first_occurrence]

# Discard every motif that is identical to one of the padded positive motifs.
not_positive <- is.na(match(truemotifs, PositiveTrueMotifs))
truemotifs <- truemotifs[not_positive]

# Two-column table (protein name, motif) with any remaining commas stripped,
# because the comma is the field separator of the output file.
outputfile <- gsub(",", "", cbind(names(truemotifs), truemotifs))

# Written without quotes, row names or column names. append = TRUE means that
# running the script again with the same output file name adds the rows to the
# existing file instead of replacing it.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  append = TRUE,
  quote = FALSE,
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)
|