Mercurial > repos > jfb > negative_motif_finder_7_7
comparison NMF/NMF-working-7-4-2020.R @ 5:5edbfbeba354 draft default tip
Uploaded
author | jfb |
---|---|
date | Tue, 14 Jul 2020 20:01:02 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:220d4359ec9b | 5:5edbfbeba354 |
---|---|
1 NAMEOFOUTPUTFILE<-"output1.csv" | |
2 | |
3 SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE) | |
4 SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE) | |
5 SBF<-t(SBF) | |
6 PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE) | |
7 | |
8 | |
9 YsToim<-rep("xY",times=nrow(PositiveMotifs)) | |
10 PositiveMotifs[,11]<-YsToim | |
11 | |
12 | |
13 | |
14 #this code is meant to take a list of proteins, and our list of phosphopeptides, and find which Y-containing peptides could have been found phosphorylated but weren't | |
15 | |
16 | |
17 | |
18 #first then I create the list of phosphopeptides | |
19 Positive9Letters<-PositiveMotifs[,4:18] | |
20 PositiveTrueMotifs<-c() | |
21 | |
22 #then I take the proteins | |
23 AccessionNumbers<-as.character(SBF[2:nrow(SBF),1]) | |
24 AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)] | |
25 #the above is only those proteins from which our phosphopeptides sprung, the below is every protein in the human proteome | |
26 ALLPOSSIBLE<-SuperAwesometrial[,1] | |
27 ALLPOSSIBLE<-as.character(ALLPOSSIBLE) | |
28 | |
29 for (q in 1:nrow(Positive9Letters)) { | |
30 LeftJust<-0 | |
31 RightJust<-0 | |
32 | |
33 | |
34 motifmotif<-Positive9Letters[q,] | |
35 motifmotif<-paste(motifmotif, collapse = "",sep = "") | |
36 motifmotif<-unlist(strsplit(motifmotif, split = "")) | |
37 position <- match(x = "x", table = motifmotif) | |
38 LeftJust<-position-1 | |
39 RightJust<-length(motifmotif)-position-1 | |
40 #find which position was the phospho-amino acid, it is marked with an X | |
41 | |
42 LeftSpaces<-rep(x=" ", times=(7-LeftJust)) | |
43 RightSpaces<-rep(x=" ", times=(7-RightJust)) | |
44 motifmotif<-motifmotif[!motifmotif %in% c("x")] | |
45 motifmotif<-c(LeftSpaces,motifmotif,RightSpaces) | |
46 motifmotif<-paste(motifmotif, collapse = "",sep = "") | |
47 #put spaces on either side of the motif if the motif does not fill out a -7 to +7 motif | |
48 | |
49 PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif) | |
50 } | |
51 | |
52 | |
53 | |
54 allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1) | |
55 thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1) | |
56 #I preallocate vectors for efficiency, but I have no way of knowing how big these particular vectors need to be, so I just make them way bigger | |
57 #than I know they need to be. A vector 1 million long is plenty big | |
58 | |
59 MotifNumber<-2 | |
60 | |
61 locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE)) | |
62 | |
63 | |
64 if (sum(locations)>0){ | |
65 whereisit<-locations | |
66 for (u in 1:length(whereisit)) { | |
67 i<-whereisit[u] | |
68 name<-c() | |
69 data<-c() | |
70 name<-as.character(SuperAwesometrial[i,1]) | |
71 #the name of each protein is the first column | |
72 name<-sub(x=name, pattern=",", replacement="") | |
73 #the names may contain commas, remove them | |
74 data<-as.character(SuperAwesometrial[i,3]) | |
75 #the amino acids are stored in the third column | |
76 data<-strsplit(data,"") | |
77 #split them into their component letters | |
78 data<-unlist(data) | |
79 #turn them into a vector | |
80 motif<-c() | |
81 | |
82 #this part below is where I can speed things up | |
83 The_Ys<-data=="Y" | |
84 #find any Y in the protein | |
85 if (sum(The_Ys>0)){ #if there is at least one Y | |
86 Where_are_they<-which(The_Ys %in% TRUE) | |
87 for (z in 1:length(Where_are_they)) { #then for every Y, make a motif | |
88 | |
89 j<-Where_are_they[z] | |
90 a <- j-7 | |
91 a<-ifelse(a<1, a <- 1, a <- a) | |
92 b<-j+7 | |
93 b<-ifelse(b>length(data), b <- length(data), b <- | |
94 b) | |
95 #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein | |
96 | |
97 LeftSide<-7-(j-a) | |
98 RightSide<-7-(b-j) | |
99 #how is the motif justified? Does it have exactly 4 letters to the left/right, or does it not? | |
100 | |
101 leftspaces<-rep(" ",times=LeftSide) | |
102 rightspaces<-rep(" ",times=RightSide) | |
103 #add blank spaces if the motif has less than 4 letters to the left/right | |
104 | |
105 | |
106 motif<-(data[(a):(b)]) | |
107 motif<-c(leftspaces,motif,rightspaces) | |
108 #save that motif, which is the Y and +/- 4 amino acids, including truncation | |
109 | |
110 motif<-paste(motif, sep="", collapse="") | |
111 #the 4 amino acids, put them back together into a single string | |
112 motif<-matrix(data=c(motif),nrow = 1) | |
113 namesss<-matrix(data=c(name),nrow = 1) | |
114 #keep this motif and separately keep the name of the protein it came from | |
115 | |
116 allmotifs[MotifNumber,1]<-motif | |
117 thenames[MotifNumber,1]<-namesss | |
118 MotifNumber<-MotifNumber+1 | |
119 | |
120 } | |
121 | |
122 } | |
123 } | |
124 } | |
125 | |
126 | |
127 | |
128 | |
129 names(allmotifs)<-thenames | |
130 | |
131 truemotifs<-allmotifs[!duplicated(allmotifs)] | |
132 #remove duplicates from the motifs and names | |
133 | |
134 #make the motifs and names into matrices | |
135 truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs] | |
136 outputfile<-cbind(names(truemotifs),truemotifs) | |
137 outputfile <- gsub(",","",outputfile) | |
138 write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",", | |
139 row.names=FALSE,col.names = FALSE, na="", append=TRUE) |