# HG changeset patch # User jfb # Date 1594771262 14400 # Node ID 5edbfbeba354019e8212677c20455e9427a297da # Parent 220d4359ec9ba6019f2ddba2a1a4c7eb1327615d Uploaded diff -r 220d4359ec9b -r 5edbfbeba354 NMF/NMF-working-2-5-20.R --- a/NMF/NMF-working-2-5-20.R Thu Feb 06 14:20:36 2020 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,190 +0,0 @@ -NAMEOFOUTPUTFILE<-"output1.csv" - -SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE) -#once you've used the other script to turn the FASFA into a CSV, copypaste the filepath and name -#of the csv into this line between the quote marks. - -SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE) -SBF<-t(SBF) - -PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE) -#because of R reasons, it is required that the motifs in this file have blank cells instead of spaces where there is no letter in -#the motif - -YsToim<-rep("xY",times=nrow(PositiveMotifs)) -PositiveMotifs[,11]<-YsToim - - - -################################################################################################################################ -#I have to paste them, then split and unlist them, then find the x and paste again -Positive9Letters<-PositiveMotifs[,4:18] -#head(Positive9Letters) -PositiveTrueMotifs<-c() - -AccessionNumbers<-as.character(SBF[2:nrow(SBF),1]) -AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)] -ALLPOSSIBLE<-SuperAwesometrial[,1] -ALLPOSSIBLE<-as.character(ALLPOSSIBLE) -################################################################################################################################ - -for (q in 1:nrow(Positive9Letters)) { - LeftJust<-0 - RightJust<-0 - - motifmotif<-Positive9Letters[q,] - motifmotif<-paste(motifmotif, collapse = "",sep = "") - - motifmotif<-unlist(strsplit(motifmotif, split = "")) - - position <- match(x = "x", table = motifmotif) - LeftJust<-position-1 - RightJust<-length(motifmotif)-position-1 - - LeftSpaces<-rep(x=" ", times=(7-LeftJust)) - RightSpaces<-rep(x=" ", times=(7-RightJust)) - - motifmotif<-motifmotif[!motifmotif %in% c("x")] - - motifmotif<-c(LeftSpaces,motifmotif,RightSpaces) - motifmotif<-paste(motifmotif, collapse = "",sep = "") - PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif) -} - - - -################################################################################################################################ -allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1) -thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1) -################################################################################################################################ - -################################################################################################################################ - -#I need to preallocate these vectors. I will find out how many y's there are total and then make the vector that many long -#Or what I need is two separate loops. First loop finds all the accession number positions that Grep to the FASTA (which is called ALLPOSSIBLE) -#then take only those AAs from the fasta and count their y's, preallocate the vector for part 2 to that many y's -#those accessions and such as saved in a vector... this seems like it would be no faster actually - -#then_that_are <- which(AccessionNumbers %in% ALLPOSSIBLE) - -MotifNumber<-2 - -#TrueMotifNums<-which(ALLPOSSIBLE %in% AccessionNumbers) -#fihlodeANs<-c() - -locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE)) - -if (sum(locations)>0){ - whereisit<-locations - for (u in 1:length(whereisit)) { - i<-whereisit[u] - name<-c() - data<-c() - name<-as.character(SuperAwesometrial[i,1]) - #the name of each protein is the first column - name<-sub(x=name, pattern=",", replacement="") - #the names may contain commas, remove them - data<-as.character(SuperAwesometrial[i,3]) - #the amino acids are stored in the third column - data<-strsplit(data,"") - #split them into their component letters - data<-unlist(data) - #turn them into a vector - motif<-c() - - #this part below is where I can speed things up - The_Ys<-data=="Y" - #find any Y in the protein - if (sum(The_Ys>0)){ #if there is at least one Y - Where_are_they<-which(The_Ys %in% TRUE) - for (z in 1:length(Where_are_they)) { #then for every Y, make a motif - - j<-Where_are_they[z] - #for (j in 1:length(data)){ - #if ("Y" %in% data[j]){ - #if there is a Y aka Tyrosine in the data - #allmotifs=rbind(allmotifs,data[(i-4):(i+4)]) - a <- j-7 - a<-ifelse(a<1, a <- 1, a <- a) - # if (a<1){ - # a <- 1 - # } - b<-j+7 - b<-ifelse(b>length(data), b <- length(data), b <- - b) - # if (b>length(data)){ - # b<-length(data) - # } - #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein - - LeftSide<-7-(j-a) - RightSide<-7-(b-j) - #how is the motif justified? Does it have exactly 4 letters to the left/right, or does it not? - - leftspaces<-rep(" ",times=LeftSide) - rightspaces<-rep(" ",times=RightSide) - #add blank spaces if the motif has less than 4 letters to the left/right - - - motif<-(data[(a):(b)]) - motif<-c(leftspaces,motif,rightspaces) - #save that motif, which is the Y and +/- 4 amino acids, including truncation - - # lens<-c(lens,length(motif)) - # leni<-c(leni,i) - # lenj<-c(lenj,j) - - motif<-paste(motif, sep="", collapse="") - #the 4 amino acids, put them back together into a single string - motif<-matrix(data=c(motif),nrow = 1) - namesss<-matrix(data=c(name),nrow = 1) - #keep this motif and separately keep the name of the protein it came from - - # allmotifs<-rbind(allmotifs,motif) - # thenames<-rbind(thenames,namesss) - allmotifs[MotifNumber,1]<-motif - thenames[MotifNumber,1]<-namesss - MotifNumber<-MotifNumber+1 - - #add names and motifs to a growing list - - # write.table(motif, file="TRIALTIALRIAALSKFDJSD.csv", quote=FALSE, sep=",", - # row.names=FALSE,col.names = FALSE, na="", append=TRUE) - #and then write it into a csv, the sep is needed so that the two pieces of the data frame are separated - #append has 1to equal true because this thing will loop around many times adding more and more data points - #you must create a new filename/filepath with each new data you run - } - - } - } -} - - - - -################################################################################################################################ -################################################################################################################################ -################################################################################################################################ - - -# for (i in 1:nrow(SuperAwesometrial)){ -# -# } - -names(allmotifs)<-thenames - -truemotifs<-allmotifs[!duplicated(allmotifs)] -#truenames<-thenames[!duplicated(thenames)] -#remove duplicates from the motifs and names - -#make the motifs and names into matrices - - -truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs] - -outputfile<-cbind(names(truemotifs),truemotifs) - -outputfile <- gsub(",","",outputfile) - -write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",", - row.names=FALSE,col.names = FALSE, na="", append=TRUE) diff -r 220d4359ec9b -r 5edbfbeba354 NMF/NMF-working-7-4-2020.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NMF/NMF-working-7-4-2020.R Tue Jul 14 20:01:02 2020 -0400 @@ -0,0 +1,139 @@ +NAMEOFOUTPUTFILE<-"output1.csv" + +SuperAwesometrial <- read.delim2("input1.tabular", header=FALSE) +SBF<-read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE) +SBF<-t(SBF) +PositiveMotifs <- read.csv("input2.csv", stringsAsFactors=FALSE) + + +YsToim<-rep("xY",times=nrow(PositiveMotifs)) +PositiveMotifs[,11]<-YsToim + + + +#this code is meant to take a list of proteins, and our list of phosphopeptides, and find which Y-containing peptides could have been found phosphorylated but weren't + + + +#first then I create the list of phosphopeptides +Positive9Letters<-PositiveMotifs[,4:18] +PositiveTrueMotifs<-c() + +#then I take the proteins +AccessionNumbers<-as.character(SBF[2:nrow(SBF),1]) +AccessionNumbers<-AccessionNumbers[!is.na(AccessionNumbers)] +#the above is only those proteins from which our phosphopeptides sprung, the below is every protein in the human proteome +ALLPOSSIBLE<-SuperAwesometrial[,1] +ALLPOSSIBLE<-as.character(ALLPOSSIBLE) + +for (q in 1:nrow(Positive9Letters)) { + LeftJust<-0 + RightJust<-0 + + + motifmotif<-Positive9Letters[q,] + motifmotif<-paste(motifmotif, collapse = "",sep = "") + motifmotif<-unlist(strsplit(motifmotif, split = "")) + position <- match(x = "x", table = motifmotif) + LeftJust<-position-1 + RightJust<-length(motifmotif)-position-1 + #find which position was the phospho-amino acid, it is marked with an X + + LeftSpaces<-rep(x=" ", times=(7-LeftJust)) + RightSpaces<-rep(x=" ", times=(7-RightJust)) + motifmotif<-motifmotif[!motifmotif %in% c("x")] + motifmotif<-c(LeftSpaces,motifmotif,RightSpaces) + motifmotif<-paste(motifmotif, collapse = "",sep = "") + #put spaces on either side of the motif if the motif does not fill out a -7 to +7 motif + + PositiveTrueMotifs<-c(PositiveTrueMotifs,motifmotif) +} + + + +allmotifs<-matrix(data=rep("Motifs", times= 1000000),ncol = 1) +thenames<-matrix(data=rep("AccessionNumbers", times= 1000000),ncol = 1) +#I preallocate vectors for efficiency, but I have no way of knowing how big these particular vectors need to be, so I just make them way bigger +#than I know they need to be. A vector 1 million long is plenty big + +MotifNumber<-2 + +locations<-unique(grep(paste(AccessionNumbers,collapse="|"), ALLPOSSIBLE)) + + +if (sum(locations)>0){ + whereisit<-locations + for (u in 1:length(whereisit)) { + i<-whereisit[u] + name<-c() + data<-c() + name<-as.character(SuperAwesometrial[i,1]) + #the name of each protein is the first column + name<-sub(x=name, pattern=",", replacement="") + #the names may contain commas, remove them + data<-as.character(SuperAwesometrial[i,3]) + #the amino acids are stored in the third column + data<-strsplit(data,"") + #split them into their component letters + data<-unlist(data) + #turn them into a vector + motif<-c() + + #this part below is where I can speed things up + The_Ys<-data=="Y" + #find any Y in the protein + if (sum(The_Ys>0)){ #if there is at least one Y + Where_are_they<-which(The_Ys %in% TRUE) + for (z in 1:length(Where_are_they)) { #then for every Y, make a motif + + j<-Where_are_they[z] + a <- j-7 + a<-ifelse(a<1, a <- 1, a <- a) + b<-j+7 + b<-ifelse(b>length(data), b <- length(data), b <- + b) + #take the motif that is +/- 4 from that Y, sanity checks so that values are never off the grid from the protein + + LeftSide<-7-(j-a) + RightSide<-7-(b-j) + #how is the motif justified? Does it have exactly 4 letters to the left/right, or does it not? + + leftspaces<-rep(" ",times=LeftSide) + rightspaces<-rep(" ",times=RightSide) + #add blank spaces if the motif has less than 4 letters to the left/right + + + motif<-(data[(a):(b)]) + motif<-c(leftspaces,motif,rightspaces) + #save that motif, which is the Y and +/- 4 amino acids, including truncation + + motif<-paste(motif, sep="", collapse="") + #the 4 amino acids, put them back together into a single string + motif<-matrix(data=c(motif),nrow = 1) + namesss<-matrix(data=c(name),nrow = 1) + #keep this motif and separately keep the name of the protein it came from + + allmotifs[MotifNumber,1]<-motif + thenames[MotifNumber,1]<-namesss + MotifNumber<-MotifNumber+1 + + } + + } + } +} + + + + +names(allmotifs)<-thenames + +truemotifs<-allmotifs[!duplicated(allmotifs)] +#remove duplicates from the motifs and names + +#make the motifs and names into matrices +truemotifs<-truemotifs[!truemotifs %in% PositiveTrueMotifs] +outputfile<-cbind(names(truemotifs),truemotifs) +outputfile <- gsub(",","",outputfile) +write.table(outputfile, file=NAMEOFOUTPUTFILE, quote=FALSE, sep=",", + row.names=FALSE,col.names = FALSE, na="", append=TRUE) diff -r 220d4359ec9b -r 5edbfbeba354 NMF/NMF.xml --- a/NMF/NMF.xml Thu Feb 06 14:20:36 2020 -0500 +++ b/NMF/NMF.xml Tue Jul 14 20:01:02 2020 -0400 @@ -7,7 +7,7 @@ ln -s '$FASTA' input1.tabular && ln -s '$positives' input2.csv && ln -s '$SBF' input3.csv && - Rscript '$__tool_directory__/NMF-working-2-5-20.R' + Rscript '$__tool_directory__/NMF-working-7-4-2020.R' ]]>