Mercurial > repos > jfb > negative_motif_finder_7_7
comparison NMF/NMF.R @ 0:dd301fc4b54e draft
Uploaded
author | jfb |
---|---|
date | Fri, 23 Feb 2018 16:37:29 -0500 |
parents | |
children | a098e1274f63 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dd301fc4b54e |
---|---|
# Name of the CSV file this script writes its result to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Tab-delimited protein table without a header row. Further down, column 1 is
# used as the protein name and column 3 as the amino-acid sequence.
# Once the other script has converted the FASTA file, put the path and name of
# the resulting file between the quote marks.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# Table whose first column is used as the list of accession numbers.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE)

# Positive motifs, one letter per cell. Positions without a letter must be
# empty cells, not spaces.
# Every column is read as text on purpose: with automatic type conversion a
# column that holds only "T"/"F" letters is turned into logical TRUE/FALSE and
# a completely empty column into logical NA, and those would later be pasted
# into the motifs as "TRUE", "FALSE" or "NA".
PositiveMotifs <- read.csv("input2.csv", stringsAsFactors = FALSE,
                           colClasses = "character")

################################################################################
# Columns 4 to 18 of the positive-motif table are taken as the motif letters.
# Each row is later pasted into one string, split into single letters,
# searched for the "x" marker and pasted back together.
Positive9Letters <- PositiveMotifs[, seq(4, 18)]

# Starts out empty; the padded positive motifs are appended to it below.
PositiveTrueMotifs <- NULL

# First column of the protein table, as text, and first column of the
# accession-number table.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
AccessionNumbers <- SBF[, 1]
################################################################################

# Re-align every positive motif onto a fixed 15-character frame: 7 positions,
# one centre position, 7 positions, padded with blanks where the motif is
# shorter.
#
# In each row the first "x" is the alignment marker: the letters before it form
# the left side, the letter directly after it lands on the centre position and
# the remaining letters form the right side. Every "x" is removed from the
# result.

# Turn one row of the motif table into a vector of single letters.
# Empty/NA cells contribute nothing; logical cells (a column of only "T"/"F"
# letters that was auto-converted while reading) are mapped back to letters.
motif_row_letters <- function(motif_row) {
  cells <- vapply(motif_row, function(cell) {
    if (is.na(cell)) {
      ""
    } else if (is.logical(cell)) {
      if (cell) "T" else "F"
    } else {
      as.character(cell)
    }
  }, character(1), USE.NAMES = FALSE)
  unlist(strsplit(paste(cells, collapse = ""), split = ""))
}

# Pad the letters of one motif to the fixed frame described above.
# `row_number` is only used to make error messages traceable.
pad_positive_motif <- function(residues, row_number, flank = 7) {
  marker <- match("x", residues)
  if (is.na(marker)) {
    stop("Positive motif in row ", row_number,
         " has no \"x\" marker, so it cannot be aligned.", call. = FALSE)
  }
  left_count <- marker - 1
  # Letters after the one that directly follows the marker.
  right_count <- length(residues) - marker - 1
  if (left_count > flank || right_count > flank) {
    stop("Positive motif in row ", row_number, " has more than ", flank,
         " letters on one side of the \"x\" marker.", call. = FALSE)
  }
  kept <- residues[residues != "x"]
  paste(c(rep(" ", flank - left_count), kept, rep(" ", flank - right_count)),
        collapse = "")
}

padded_motifs <- vector("list", nrow(Positive9Letters))
for (q in seq_len(nrow(Positive9Letters))) {
  residues <- motif_row_letters(Positive9Letters[q, ])
  # Rows without any letter (e.g. trailing blank lines of the CSV) carry no
  # motif and are skipped instead of aborting the run.
  if (!any(nzchar(residues))) {
    next
  }
  padded_motifs[[q]] <- pad_positive_motif(residues, row_number = q)
}
# Skipped rows are NULL entries and vanish in unlist().
PositiveTrueMotifs <- c(PositiveTrueMotifs, unlist(padded_motifs))


################################################################################
# Seed the two result tables with one header-like row each; the rows found
# below are appended underneath.
allmotifs <- matrix(data = "Motifs", nrow = 1, ncol = 1)
thenames <- matrix(data = "AccessionNumbers", nrow = 1, ncol = 1)
################################################################################

# For every accession number, find the proteins whose name (column 1 of
# SuperAwesometrial) contains it, and collect one 15-character motif for every
# "Y" in their sequence (column 3): the Y plus up to 7 letters on each side,
# padded with blanks where the sequence ends sooner, so the Y always sits at
# position 8.
#
# Results are gathered in lists and bound onto allmotifs / thenames once at the
# end; binding row by row inside the loops copies both tables for every motif.
motif_flank <- 7
collected_motifs <- list()
collected_names <- list()

for (q in seq_along(AccessionNumbers)) {
  patterno <- AccessionNumbers[q]
  # A missing accession number cannot be matched and an empty one would match
  # every protein, so neither is searched for.
  if (is.na(patterno) || !nzchar(patterno)) {
    next
  }

  # Literal (non-regex) substring search over all protein names at once.
  whereisit <- which(grepl(patterno, ALLPOSSIBLE, fixed = TRUE))

  for (i in whereisit) {
    # The name of each protein is the first column; it may contain commas,
    # which are all removed.
    name <- gsub(",", "", as.character(SuperAwesometrial[i, 1]), fixed = TRUE)

    # The amino acids are stored in the third column; split them into a vector
    # of single letters.
    data <- unlist(strsplit(as.character(SuperAwesometrial[i, 3]), ""))

    # Positions of every Y in the sequence (upper case only).
    tyrosines <- which(data == "Y")
    if (length(tyrosines) == 0) {
      next
    }

    motifs <- vapply(tyrosines, function(j) {
      # Window of +/- 7 around the Y, clamped to the ends of the sequence.
      a <- max(1, j - motif_flank)
      b <- min(length(data), j + motif_flank)
      # Blanks make up for letters missing on either side.
      leftspaces <- rep(" ", times = motif_flank - (j - a))
      rightspaces <- rep(" ", times = motif_flank - (b - j))
      paste(c(leftspaces, data[a:b], rightspaces), collapse = "")
    }, character(1))

    # Keep each motif together with the name of the protein it came from.
    collected_motifs[[length(collected_motifs) + 1]] <- motifs
    collected_names[[length(collected_names) + 1]] <- rep(name, length(motifs))
  }
}

if (length(collected_motifs) > 0) {
  allmotifs <- rbind(allmotifs, matrix(unlist(collected_motifs), ncol = 1))
  thenames <- rbind(thenames, matrix(unlist(collected_names), ncol = 1))
}

################################################################################

# Attach the protein names to the motifs as element names so that the two stay
# paired through the filtering below.
names(allmotifs) <- thenames

# Keep only the first occurrence of every motif.
first_seen <- !duplicated(allmotifs)
truemotifs <- allmotifs[first_seen]

# Drop every motif that also appears among the padded positive motifs.
is_positive <- truemotifs %in% PositiveTrueMotifs
truemotifs <- truemotifs[!is_positive]

# Two columns: protein name, then motif. Commas are stripped from both so that
# they cannot be confused with the field separator of the output file.
outputfile <- cbind(names(truemotifs), truemotifs)
outputfile <- gsub(",", "", outputfile)

# Plain comma-separated rows without quoting, row names or column names.
# append = TRUE means an already existing output file is extended, not
# replaced.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  append = TRUE,
  quote = FALSE,
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)