comparison NMF/NMF.R @ 0:dd301fc4b54e draft

Uploaded
author jfb
date Fri, 23 Feb 2018 16:37:29 -0500
parents
children a098e1274f63
comparison
equal deleted inserted replaced
-1:000000000000 0:dd301fc4b54e
# Name of the CSV file this script writes its results to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Protein table produced by the companion script that converts the FASTA file
# into a table. Put its path between the quote marks. Later stages read the
# protein name from column 1 and the amino-acid sequence from column 3.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# Table whose first column holds the accession numbers to look up.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE)

# Known positive motifs, one letter per cell in columns 4 through 18. Cells
# with no letter must be blank (not a space).
# Every column is read as text on purpose: left to guess, read.csv turns a
# column holding only "T" / "F" (threonine / phenylalanine) into logical
# TRUE / FALSE, and an entirely blank column into logical NA, which would then
# be pasted into the motifs as the words "TRUE", "FALSE" or "NA".
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)
16
17 ################################################################################################################################
18 #I have to paste them, then split and unlist them, then find the x and paste again
# The motif letters live in columns 4 through 18 of the positive-motif table,
# one letter per cell.
Positive9Letters <- PositiveMotifs[, 4:18]

# Starts out empty; the normalised positive motifs are appended to it below.
PositiveTrueMotifs <- NULL

# Accession numbers to search for: the first column of the SBF table.
AccessionNumbers <- SBF[, 1]

# Protein names (first column of the protein table) as plain character strings.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
26 ################################################################################################################################
27
# Rewrite every positive motif as a fixed-width, 15-character string.
# Each row is pasted into one string and split into single characters. The
# first "x" in it is a marker: the letters before it form the left flank, the
# letter right after it is the centre, and the rest form the right flank. The
# marker itself is dropped and each flank is padded with spaces up to 7
# characters, so the centre letter always lands on position 8.
PositiveTrueMotifs <- character(nrow(Positive9Letters))

# seq_len() (unlike 1:nrow) performs no iteration when the table is empty.
for (q in seq_len(nrow(Positive9Letters))) {
  cells <- vapply(
    Positive9Letters[q, ],
    as.character,
    character(1),
    USE.NAMES = FALSE
  )
  # A missing cell means "no letter here"; without this it would be pasted
  # into the motif as the text "NA".
  cells[is.na(cells)] <- ""

  motif_chars <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

  marker_at <- match("x", motif_chars)
  if (is.na(marker_at)) {
    stop(
      "Row ", q, " of the positive motifs contains no 'x' marker.",
      call. = FALSE
    )
  }

  left_count <- marker_at - 1
  # Letters after the centre letter (the centre itself is not counted).
  right_count <- length(motif_chars) - marker_at - 1
  if (left_count > 7 || right_count > 7) {
    stop(
      "Row ", q, " of the positive motifs has more than 7 letters on one ",
      "side of the 'x' marker.",
      call. = FALSE
    )
  }

  left_pad <- rep(" ", times = 7 - left_count)
  right_pad <- rep(" ", times = 7 - right_count)
  motif_chars <- motif_chars[motif_chars != "x"]

  PositiveTrueMotifs[q] <- paste(
    c(left_pad, motif_chars, right_pad),
    collapse = ""
  )
}
50
51
52 ################################################################################################################################
# Collect a 15-character window around every tyrosine ("Y") of every protein
# whose name contains one of the accession numbers.
#
# Results:
#   allmotifs - one-column character matrix; first row is the header "Motifs",
#               followed by one row per window found.
#   thenames  - one-column character matrix; first row is the header
#               "AccessionNumbers", followed by the protein name belonging to
#               the window in the same row of allmotifs.

# Return one fixed-width window per "Y" in `sequence`: the Y itself plus up to
# `flank` residues on either side. Where the window would run past either end
# of the sequence it is filled with spaces, so every window is exactly
# 2 * flank + 1 characters long with the Y in the middle.
extract_y_motifs <- function(sequence, flank = 7) {
  residues <- unlist(strsplit(sequence, ""))
  centers <- which(residues == "Y")
  if (length(centers) == 0) {
    return(character(0))
  }
  # Padding both ends with `flank` spaces lets one substring() call handle the
  # truncated windows near the ends: position j of the original sequence sits
  # at position j + flank of the padded string.
  padding <- paste(rep(" ", times = flank), collapse = "")
  padded <- paste0(padding, sequence, padding)
  substring(padded, centers, centers + 2 * flank)
}

motif_chunks <- list()
name_chunks <- list()

for (q in seq_along(AccessionNumbers)) {
  patterno <- as.character(AccessionNumbers[q])
  # A missing accession number cannot be searched for, and an empty one would
  # match every protein name, so neither is looked up.
  if (is.na(patterno) || !nzchar(patterno)) {
    next
  }

  # grepl() is already vectorised over the names; fixed = TRUE treats the
  # accession number as literal text rather than a regular expression.
  hits <- which(grepl(patterno, ALLPOSSIBLE, fixed = TRUE))

  for (i in hits) {
    # The protein name is in the first column; strip every comma from it so
    # it cannot break the comma-separated output.
    protein_name <- gsub(
      ",", "",
      as.character(SuperAwesometrial[i, 1]),
      fixed = TRUE
    )
    # The amino-acid sequence is in the third column.
    found <- extract_y_motifs(as.character(SuperAwesometrial[i, 3]))

    slot <- length(motif_chunks) + 1
    motif_chunks[[slot]] <- found
    name_chunks[[slot]] <- rep(protein_name, times = length(found))
  }
}

# Build both matrices once at the end instead of rbind()-ing a row per motif.
allmotifs <- matrix(c("Motifs", unlist(motif_chunks)), ncol = 1)
thenames <- matrix(c("AccessionNumbers", unlist(name_chunks)), ncol = 1)
131
132
133
134
135 ################################################################################################################################
136 ################################################################################################################################
137 ################################################################################################################################
138
139
140 # for (i in 1:nrow(SuperAwesometrial)){
141 #
142 # }
143
# Label each motif with the protein name stored in the matching row of
# thenames, so the name travels with the motif through the filtering below.
names(allmotifs) <- thenames

# Keep only the first occurrence of every motif.
is_first_occurrence <- !duplicated(allmotifs)
truemotifs <- allmotifs[is_first_occurrence]

# Drop every motif that also appears among the known positive motifs.
is_known_positive <- truemotifs %in% PositiveTrueMotifs
truemotifs <- truemotifs[!is_known_positive]

# Two columns: protein name, then motif. Any remaining comma is stripped so
# that it cannot be mistaken for a field separator in the output.
outputfile <- cbind(names(truemotifs), truemotifs)
outputfile <- gsub(",", "", outputfile)

# Written without row or column names; rows are appended to the output file
# if it already exists.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  quote = FALSE,
  sep = ",",
  row.names = FALSE,
  col.names = FALSE,
  na = "",
  append = TRUE
)