Mercurial > repos > jfb > negative_motif_finder_7_7
comparison NMF/NMF-working-2-5-20.R @ 4:220d4359ec9b draft
Uploaded
author | jfb |
---|---|
date | Thu, 06 Feb 2020 14:20:36 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3:a69be20d500d | 4:220d4359ec9b |
---|---|
1 NAMEOFOUTPUTFILE<-"output1.csv" | |
2 | |
3 SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE) | |
4 #once you've used the other script to turn the FASTA into a CSV, copy-paste the filepath and name | |
5 #of the csv into this line between the quote marks. | |
6 | |
7 SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE) | |
8 SBF<-t(SBF) | |
9 | |
10 PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE) | |
11 #because of R reasons, it is required that the motifs in this file have blank cells instead of spaces where there is no letter in | |
12 #the motif | |
13 | |
14 YsToim<-rep("xY",times=nrow(PositiveMotifs)) | |
15 PositiveMotifs[,11]<-YsToim | |
16 | |
17 | |
18 | |
19 ################################################################################################################################ | |
20 #I have to paste them, then split and unlist them, then find the x and paste again | |
21 Positive9Letters<-PositiveMotifs[,4:18] | |
22 #head(Positive9Letters) | |
23 PositiveTrueMotifs<-c() | |
24 | |
25 AccessionNumbers<-as.character(SBF[2:nrow(SBF),1]) | |
26 AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)] | |
27 ALLPOSSIBLE<-SuperAwesometrial[,1] | |
28 ALLPOSSIBLE<-as.character(ALLPOSSIBLE) | |
29 ################################################################################################################################ | |
30 | |
31 for (q in 1:nrow(Positive9Letters)) { | |
32 LeftJust<-0 | |
33 RightJust<-0 | |
34 | |
35 motifmotif<-Positive9Letters[q,] | |
36 motifmotif<-paste(motifmotif, collapse = "",sep = "") | |
37 | |
38 motifmotif<-unlist(strsplit(motifmotif, split = "")) | |
39 | |
40 position <- match(x = "x", table = motifmotif) | |
41 LeftJust<-position-1 | |
42 RightJust<-length(motifmotif)-position-1 | |
43 | |
44 LeftSpaces<-rep(x=" ", times=(7-LeftJust)) | |
45 RightSpaces<-rep(x=" ", times=(7-RightJust)) | |
46 | |
47 motifmotif<-motifmotif[!motifmotif %in% c("x")] | |
48 | |
49 motifmotif<-c(LeftSpaces,motifmotif,RightSpaces) | |
50 motifmotif<-paste(motifmotif, collapse = "",sep = "") | |
51 PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif) | |
52 } | |
53 | |
54 | |
55 | |
56 ################################################################################################################################ | |
57 allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1) | |
58 thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1) | |
59 ################################################################################################################################ | |
60 | |
61 ################################################################################################################################ | |
62 | |
63 #I need to preallocate these vectors. I will find out how many y's there are total and then make the vector that many long | |
64 #Or what I need is two separate loops. First loop finds all the accession number positions that Grep to the FASTA (which is called ALLPOSSIBLE) | |
65 #then take only those AAs from the fasta and count their y's, preallocate the vector for part 2 to that many y's | |
66 #those accessions and such are saved in a vector... this seems like it would be no faster actually | |
67 | |
68 #then_that_are <- which(AccessionNumbers %in% ALLPOSSIBLE) | |
69 | |
70 MotifNumber<-2 | |
71 | |
72 #TrueMotifNums<-which(ALLPOSSIBLE %in% AccessionNumbers) | |
73 #fihlodeANs<-c() | |
74 | |
75 locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE)) | |
76 | |
77 if (sum(locations)>0){ | |
78 whereisit<-locations | |
79 for (u in 1:length(whereisit)) { | |
80 i<-whereisit[u] | |
81 name<-c() | |
82 data<-c() | |
83 name<-as.character(SuperAwesometrial[i,1]) | |
84 #the name of each protein is the first column | |
85 name<-sub(x=name, pattern=",", replacement="") | |
86 #the names may contain commas, remove them | |
87 data<-as.character(SuperAwesometrial[i,3]) | |
88 #the amino acids are stored in the third column | |
89 data<-strsplit(data,"") | |
90 #split them into their component letters | |
91 data<-unlist(data) | |
92 #turn them into a vector | |
93 motif<-c() | |
94 | |
95 #this part below is where I can speed things up | |
96 The_Ys<-data=="Y" | |
97 #find any Y in the protein | |
98 if (sum(The_Ys>0)){ #if there is at least one Y | |
99 Where_are_they<-which(The_Ys %in% TRUE) | |
100 for (z in 1:length(Where_are_they)) { #then for every Y, make a motif | |
101 | |
102 j<-Where_are_they[z] | |
103 #for (j in 1:length(data)){ | |
104 #if ("Y" %in% data[j]){ | |
105 #if there is a Y aka Tyrosine in the data | |
106 #allmotifs=rbind(allmotifs,data[(i-4):(i+4)]) | |
107 a <- j-7 | |
108 a<-ifelse(a<1, a <- 1, a <- a) | |
109 # if (a<1){ | |
110 # a <- 1 | |
111 # } | |
112 b<-j+7 | |
113 b<-ifelse(b>length(data), b <- length(data), b <- | |
114 b) | |
115 # if (b>length(data)){ | |
116 # b<-length(data) | |
117 # } | |
118 #take the motif that is +/- 7 from that Y, with bounds clamped so the indices never run off either end of the protein | |
119 | |
120 LeftSide<-7-(j-a) | |
121 RightSide<-7-(b-j) | |
122 #how is the motif justified? Does it have exactly 7 letters to the left/right, or does it not? | |
123 | |
124 leftspaces<-rep(" ",times=LeftSide) | |
125 rightspaces<-rep(" ",times=RightSide) | |
126 #add blank spaces if the motif has fewer than 7 letters to the left/right | |
127 | |
128 | |
129 motif<-(data[(a):(b)]) | |
130 motif<-c(leftspaces,motif,rightspaces) | |
131 #save that motif, which is the Y and +/- 7 amino acids, padded with spaces where the protein was truncated | |
132 | |
133 # lens<-c(lens,length(motif)) | |
134 # leni<-c(leni,i) | |
135 # lenj<-c(lenj,j) | |
136 | |
137 motif<-paste(motif, sep="", collapse="") | |
138 #the motif's letters (amino acids plus any padding spaces), put them back together into a single string | |
139 motif<-matrix(data=c(motif),nrow = 1) | |
140 namesss<-matrix(data=c(name),nrow = 1) | |
141 #keep this motif and separately keep the name of the protein it came from | |
142 | |
143 # allmotifs<-rbind(allmotifs,motif) | |
144 # thenames<-rbind(thenames,namesss) | |
145 allmotifs[MotifNumber,1]<-motif | |
146 thenames[MotifNumber,1]<-namesss | |
147 MotifNumber<-MotifNumber+1 | |
148 | |
149 #store the name and motif in the next free row of the preallocated matrices, then advance the row counter | |
150 | |
151 # write.table(motif, file="TRIALTIALRIAALSKFDJSD.csv", quote=FALSE, sep=",", | |
152 # row.names=FALSE,col.names = FALSE, na="", append=TRUE) | |
153 #and then write it into a csv, the sep is needed so that the two pieces of the data frame are separated | |
154 #append has to equal TRUE because this thing will loop around many times adding more and more data points | |
155 #you must create a new filename/filepath with each new data you run | |
156 } | |
157 | |
158 } | |
159 } | |
160 } | |
161 | |
162 | |
163 | |
164 | |
165 ################################################################################################################################ | |
166 ################################################################################################################################ | |
167 ################################################################################################################################ | |
168 | |
169 | |
170 # for (i in 1:nrow(SuperAwesometrial)){ | |
171 # | |
172 # } | |
173 | |
174 names(allmotifs)<-thenames | |
175 | |
176 truemotifs<-allmotifs[!duplicated(allmotifs)] | |
177 #truenames<-thenames[!duplicated(thenames)] | |
178 #remove duplicates from the motifs and names | |
179 | |
180 #make the motifs and names into matrices | |
181 | |
182 | |
183 truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs] | |
184 | |
185 outputfile<-cbind(names(truemotifs),truemotifs) | |
186 | |
187 outputfile <- gsub(",","",outputfile) | |
188 | |
189 write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",", | |
190 row.names=FALSE,col.names = FALSE, na="", append=TRUE) |