Mercurial > repos > jfb > negative_motif_finder_7_7

--- a/NMF/NMF-working-2-5-20.R	Thu Feb 06 14:20:36 2020 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,190 +0,0 @@
-NAMEOFOUTPUTFILE<-"output1.csv"
-
-SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE)
-#once you've used the other script to turn the FASFA into a CSV, copypaste the filepath and name
-#of the csv into this line between the quote marks.
-
-SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
-SBF<-t(SBF)
-
-PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE)
-#because of R reasons, it is required that the motifs in this file have blank cells instead of spaces where there is no letter in
-#the motif
-
-YsToim<-rep("xY",times=nrow(PositiveMotifs))
-PositiveMotifs[,11]<-YsToim
-
-
-
-################################################################################################################################
-#I have to paste them, then split and unlist them, then find the x and paste again
-Positive9Letters<-PositiveMotifs[,4:18]
-#head(Positive9Letters)
-PositiveTrueMotifs<-c()
-
-AccessionNumbers<-as.character(SBF[2:nrow(SBF),1])
-AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)]
-ALLPOSSIBLE<-SuperAwesometrial[,1]
-ALLPOSSIBLE<-as.character(ALLPOSSIBLE)
-################################################################################################################################
-
-for (q in 1:nrow(Positive9Letters)) {
-  LeftJust<-0
-  RightJust<-0
-
-  motifmotif<-Positive9Letters[q,]
-  motifmotif<-paste(motifmotif, collapse = "",sep = "")
-
-  motifmotif<-unlist(strsplit(motifmotif, split = ""))
-
-  position <- match(x = "x", table = motifmotif)
-  LeftJust<-position-1
-  RightJust<-length(motifmotif)-position-1
-
-  LeftSpaces<-rep(x=" ", times=(7-LeftJust))
-  RightSpaces<-rep(x=" ", times=(7-RightJust))
-
-  motifmotif<-motifmotif[!motifmotif %in% c("x")]
-
-  motifmotif<-c(LeftSpaces,motifmotif,RightSpaces)
-  motifmotif<-paste(motifmotif, collapse = "",sep = "")
-  PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif)
-}
-
-
-
-################################################################################################################################
-allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1)
-thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1)
-################################################################################################################################
-
-################################################################################################################################
-
-#I need to preallocate these vectors.  I will find out how many y's there are total and then make the vector that many long
-#Or what I need is two separate loops.  First loop finds all the accession number positions that Grep to the FASTA (which is called ALLPOSSIBLE)
-#then take only those AAs from the fasta and count their y's, preallocate the vector for part 2 to that many y's
-#those accessions and such as saved in a vector... this seems like it would be no faster actually
-
-#then_that_are <- which(AccessionNumbers %in% ALLPOSSIBLE)
-
-MotifNumber<-2
-
-#TrueMotifNums<-which(ALLPOSSIBLE %in% AccessionNumbers)
-#fihlodeANs<-c()
-
-locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE))
-
-if (sum(locations)>0){
-  whereisit<-locations
-  for (u in 1:length(whereisit)) {
-    i<-whereisit[u]
-    name<-c()
-    data<-c()
-    name<-as.character(SuperAwesometrial[i,1])
-    #the name of each protein is the first column
-    name<-sub(x=name, pattern=",", replacement="")
-    #the names may contain commas, remove them
-    data<-as.character(SuperAwesometrial[i,3])
-    #the amino acids are stored in the third column
-    data<-strsplit(data,"")
-    #split them into their component letters
-    data<-unlist(data)
-    #turn them into a vector
-    motif<-c()
-
-    #this part below is where I can speed things up
-    The_Ys<-data=="Y"
-    #find any Y in the protein
-    if (sum(The_Ys>0)){ #if there is at least one Y
-      Where_are_they<-which(The_Ys %in% TRUE)
-      for (z in 1:length(Where_are_they)) { #then for every Y, make a motif
-
-        j<-Where_are_they[z]
-        #for (j in 1:length(data)){
-        #if ("Y" %in% data[j]){
-        #if there is a Y aka Tyrosine in the data
-        #allmotifs=rbind(allmotifs,data[(i-4):(i+4)])
-        a <- j-7
-        a<-ifelse(a<1, a <- 1, a <- a)
-        # if (a<1){
-        #   a <- 1
-        # }
-        b<-j+7
-        b<-ifelse(b>length(data), b <- length(data), b <-
-                    b)
-        # if (b>length(data)){
-        #   b<-length(data)
-        # }
-        #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein
-
-        LeftSide<-7-(j-a)
-        RightSide<-7-(b-j)
-        #how is the motif justified?  Does it have exactly 4 letters to the left/right, or does it not?
-
-        leftspaces<-rep(" ",times=LeftSide)
-        rightspaces<-rep(" ",times=RightSide)
-        #add blank spaces if the motif has less than 4 letters to the left/right
-
-
-        motif<-(data[(a):(b)])
-        motif<-c(leftspaces,motif,rightspaces)
-        #save that motif, which is the Y and +/- 4 amino acids, including truncation
-
-        # lens<-c(lens,length(motif))
-        # leni<-c(leni,i)
-        # lenj<-c(lenj,j)
-
-        motif<-paste(motif, sep="", collapse="")
-        #the 4 amino acids, put them back together into a single string
-        motif<-matrix(data=c(motif),nrow = 1)
-        namesss<-matrix(data=c(name),nrow = 1)
-        #keep this motif and separately keep the name of the protein it came from
-
-        # allmotifs<-rbind(allmotifs,motif)
-        # thenames<-rbind(thenames,namesss)
-        allmotifs[MotifNumber,1]<-motif
-        thenames[MotifNumber,1]<-namesss
-        MotifNumber<-MotifNumber+1
-
-        #add names and motifs to a growing list
-
-        # write.table(motif, file="TRIALTIALRIAALSKFDJSD.csv", quote=FALSE, sep=",",
-        #             row.names=FALSE,col.names = FALSE, na="", append=TRUE)
-        #and then write it into a csv, the sep is needed so that the two pieces of the data frame are separated
-        #append has 1to equal true because this thing will loop around many times adding more and more data points
-        #you must create a new filename/filepath with each new data you run
-      }
-
-    }
-  }
-}
-
-
-
-
-################################################################################################################################
-################################################################################################################################
-################################################################################################################################
-
-
-# for (i in 1:nrow(SuperAwesometrial)){
-#
-# }
-
-names(allmotifs)<-thenames
-
-truemotifs<-allmotifs[!duplicated(allmotifs)]
-#truenames<-thenames[!duplicated(thenames)]
-#remove duplicates from the motifs and names
-
-#make the motifs and names into matrices
-
-
-truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs]
-
-outputfile<-cbind(names(truemotifs),truemotifs)
-
-outputfile <- gsub(",","",outputfile)
-
-write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",",
-             row.names=FALSE,col.names = FALSE, na="", append=TRUE)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NMF/NMF-working-7-4-2020.R	Tue Jul 14 20:01:02 2020 -0400
@@ -0,0 +1,139 @@
+NAMEOFOUTPUTFILE<-"output1.csv"
+
+SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE)
+SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
+SBF<-t(SBF)
+PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE)
+
+
+YsToim<-rep("xY",times=nrow(PositiveMotifs))
+PositiveMotifs[,11]<-YsToim
+
+
+
+#this code is meant to take a list of proteins, and our list of phosphopeptides, and find which Y-containing peptides could have been found phosphorylated but weren't
+
+
+
+#first then I create the list of phosphopeptides
+Positive9Letters<-PositiveMotifs[,4:18]
+PositiveTrueMotifs<-c()
+
+#then I take the proteins
+AccessionNumbers<-as.character(SBF[2:nrow(SBF),1])
+AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)]
+#the above is only those proteins from which our phosphopeptides sprung, the below is every protein in the human proteome
+ALLPOSSIBLE<-SuperAwesometrial[,1]
+ALLPOSSIBLE<-as.character(ALLPOSSIBLE)
+
+for (q in 1:nrow(Positive9Letters)) {
+  LeftJust<-0
+  RightJust<-0
+
+
+  motifmotif<-Positive9Letters[q,]
+  motifmotif<-paste(motifmotif, collapse = "",sep = "")
+  motifmotif<-unlist(strsplit(motifmotif, split = ""))
+  position <- match(x = "x", table = motifmotif)
+  LeftJust<-position-1
+  RightJust<-length(motifmotif)-position-1
+  #find which position was the phospho-amino acid, it is marked with an X
+
+  LeftSpaces<-rep(x=" ", times=(7-LeftJust))
+  RightSpaces<-rep(x=" ", times=(7-RightJust))
+  motifmotif<-motifmotif[!motifmotif %in% c("x")]
+  motifmotif<-c(LeftSpaces,motifmotif,RightSpaces)
+  motifmotif<-paste(motifmotif, collapse = "",sep = "")
+  #put spaces on either side of the motif if the motif does not fill out a -7 to +7 motif
+
+  PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif)
+}
+
+
+
+allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1)
+thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1)
+#I preallocate vectors for efficiency, but I have no way of knowing how big these particular vectors need to be, so I just make them way bigger
+#than I know they need to be.  A vector 1 million long is plenty big
+
+MotifNumber<-2
+
+locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE))
+
+
+if (sum(locations)>0){
+  whereisit<-locations
+  for (u in 1:length(whereisit)) {
+    i<-whereisit[u]
+    name<-c()
+    data<-c()
+    name<-as.character(SuperAwesometrial[i,1])
+    #the name of each protein is the first column
+    name<-sub(x=name, pattern=",", replacement="")
+    #the names may contain commas, remove them
+    data<-as.character(SuperAwesometrial[i,3])
+    #the amino acids are stored in the third column
+    data<-strsplit(data,"")
+    #split them into their component letters
+    data<-unlist(data)
+    #turn them into a vector
+    motif<-c()
+
+    #this part below is where I can speed things up
+    The_Ys<-data=="Y"
+    #find any Y in the protein
+    if (sum(The_Ys>0)){ #if there is at least one Y
+      Where_are_they<-which(The_Ys %in% TRUE)
+      for (z in 1:length(Where_are_they)) { #then for every Y, make a motif
+
+        j<-Where_are_they[z]
+        a <- j-7
+        a<-ifelse(a<1, a <- 1, a <- a)
+        b<-j+7
+        b<-ifelse(b>length(data), b <- length(data), b <-
+                    b)
+        #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein
+
+        LeftSide<-7-(j-a)
+        RightSide<-7-(b-j)
+        #how is the motif justified?  Does it have exactly 4 letters to the left/right, or does it not?
+
+        leftspaces<-rep(" ",times=LeftSide)
+        rightspaces<-rep(" ",times=RightSide)
+        #add blank spaces if the motif has less than 4 letters to the left/right
+
+
+        motif<-(data[(a):(b)])
+        motif<-c(leftspaces,motif,rightspaces)
+        #save that motif, which is the Y and +/- 4 amino acids, including truncation
+
+        motif<-paste(motif, sep="", collapse="")
+        #the 4 amino acids, put them back together into a single string
+        motif<-matrix(data=c(motif),nrow = 1)
+        namesss<-matrix(data=c(name),nrow = 1)
+        #keep this motif and separately keep the name of the protein it came from
+
+        allmotifs[MotifNumber,1]<-motif
+        thenames[MotifNumber,1]<-namesss
+        MotifNumber<-MotifNumber+1
+
+      }
+
+    }
+  }
+}
+
+
+
+
+names(allmotifs)<-thenames
+
+truemotifs<-allmotifs[!duplicated(allmotifs)]
+#remove duplicates from the motifs and names
+
+#make the motifs and names into matrices
+truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs]
+outputfile<-cbind(names(truemotifs),truemotifs)
+outputfile <- gsub(",","",outputfile)
+write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",",
+             row.names=FALSE,col.names = FALSE, na="", append=TRUE)
--- a/NMF/NMF.xml	Thu Feb 06 14:20:36 2020 -0500
+++ b/NMF/NMF.xml	Tue Jul 14 20:01:02 2020 -0400
@@ -7,7 +7,7 @@
 		ln -s '$FASTA' input1.tabular &&
         ln -s '$positives' input2.csv &&
 		ln -s '$SBF' input3.csv &&
-        Rscript '$__tool_directory__/NMF-working-2-5-20.R'
+        Rscript '$__tool_directory__/NMF-working-7-4-2020.R'
     ]]></command>
     <inputs>
 		<param format="csv" name="SBF" type="data" label="Substrate Background Frequency"/>