# HG changeset patch
# User jfb
# Date 1594771031 14400
# Node ID 77f3a77f0ca2d59599e58abb93f1004a548199f7
# Parent  eb89ff215fb80c9e6eec67ed7f8cad8652bee9f0
Uploaded

diff -r eb89ff215fb8 -r 77f3a77f0ca2 KT-ID fisher test/.RData
Binary file KT-ID fisher test/.RData has changed
diff -r eb89ff215fb8 -r 77f3a77f0ca2 KT-ID fisher test/.Rhistory
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/KT-ID fisher test/.Rhistory	Tue Jul 14 19:57:11 2020 -0400
@@ -0,0 +1,13 @@
+?rnorm
+?rnorm
+?rnorm
+pwr
+?pwr
+??pwr
+power.t.test(n=6, power = .7, type = "two.sample", alternative = "two.sided")
+power.t.test(n=6, power = .9, type = "two.sample", alternative = "two.sided")
+setwd("C:/Users/John Blankenhor/Downloads/kinatest_fisher_test-eb89ff215fb8/kinatest_fisher_test-eb89ff215fb8/KT-ID fisher test")
+DataFilename<-"thedata.RData"
+load(DataFilename)
+DataFilename<-"OnlyTheRequiredSubBackFreqData.RData"
+load(DataFilename)
diff -r eb89ff215fb8 -r 77f3a77f0ca2 KT-ID fisher test/7-7-fisher-galaxy_working.R
--- a/KT-ID fisher test/7-7-fisher-galaxy_working.R	Wed Feb 05 14:12:50 2020 -0500
+++ b/KT-ID fisher test/7-7-fisher-galaxy_working.R	Tue Jul 14 19:57:11 2020 -0400
@@ -1,31 +1,33 @@
 oldw <- getOption("warn")
 options(warn = -1)
+#the Galaxy tool will error out if the system displays a warning, even if that warning does not cause the code to fail
+#in order to ensure that the tool does not error out due to a simple Warning, I turn down the warning option to ignore warnings
 
 PositiveSubstrateList<- read.csv("substrates.csv", stringsAsFactors=FALSE)
 NegativeSubstrateList<- read.csv("negatives.csv", stringsAsFactors=FALSE)
 SubstrateBackgroundFrequency<- read.csv("SBF.csv", stringsAsFactors=FALSE,header = FALSE)
 SubstrateBackgroundFrequency<-t(SubstrateBackgroundFrequency)
 SubstrateBackgroundFrequency<-SubstrateBackgroundFrequency[2:nrow(SubstrateBackgroundFrequency),]
+#this takes in all the inputs and ensures they are oriented correctly
 
 ScreenerFilename<-"screener"
 screaner<-read.csv(ScreenerFilename, header = FALSE, stringsAsFactors = FALSE)
+#the screener file is packaged with the tool and is used for actually not for anything
 
 DataFilename<-"thedata.RData"
 load(DataFilename)
-
+#this is a data file containing a list called Genelist
+#genelist is a list of every single Uniprot-Swissprot protein in the human genome
+#and those proteins are then *in-silico* digested by trypsin, and any Y-containing tryptic peptides go in the list called GeneList
+#so GeneList is a list of all Y-containing tryptic peptides from the human proteome
 
 SDtableAndPercentTable<-"output1.csv"
 NormalizationScore_CharacterizationTable<-"output2.csv"
 SequenceScoringAndScreening<-"output3.csv"
-
-
-
-
+#these are the names of the output files that will be generated
 
-SiteSelectivityTable_EndogenousProbabilityMatrix_NormalizationScore_CharacterizationTable<-NormalizationScore_CharacterizationTable
-FILENAME2<-NormalizationScore_CharacterizationTable
-FILENAME3<-SequenceScoringAndScreening
 substrates<-matrix(rep("A",times=((nrow(PositiveSubstrateList)-1)*15)),ncol = 15)
+#an "empty" matrix will gets filled below.  This matrix will contain all the letters of the input Positive Substrates
 
 for (i in 2:nrow(PositiveSubstrateList))
 {
@@ -39,15 +41,10 @@
 
 substrates2<-substrates
 substrates2[substrates2==""]<-"O"
-
-#I will make it so that all blank values in substrates get a O after I'm done with it
-
-# SpacesToOs<-c(""="O",)
-# substrates<-SpacesToOs[substrates]
+#any blank spots on the substrates list are places where there was no Amino Acid, so I give them the letter O.
 
 
-
-#create the percent table
+#I create the percent table here, which finds what percertage of each amino acid was present at each position in the substrates, from -7 to +7
 if (1==1){
 Column1<-substrates[,1]
 Column2<-substrates[,2]
@@ -422,48 +419,46 @@
 Y14<-sum(Column14 %in% "Y")
 Y15<-sum(Column15 %in% "Y")
 YllYs<-cbind(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13,Y14,Y15)
+
+PercentTable<-rbind(AllAs,CllCs,DllDs,EllEs,FllFs,GllGs,HllHs,IllIs,KllKs,LllLs,MllMs,NllNs,PllPs,QllQs,RllRs,SllSs,TllTs,VllVs,WllWs,YllYs,OllOs)
 }
-#this is substrate percents
-
-#A C D E F G H I K L N P Q R S T V W Y O
 
 AllSubBackFreq<-array(data = NA,dim = c(21,15,nrow(SubstrateBackgroundFrequency)))
-# vectorvictor<-rep(1,times=nrow(SubstrateBackgroundFrequency))
-# AllSubBackFreq[20,5,]<-vectorvictor
 #this is where I'm creating the new SubBackFreq table, I have a list with all possible SBF matrices,
 #I perform a for function to find all the matrices that are in Substrate Background Frequency
 #and place them all in this array, then I will do mean and SD
+
 AAccessionNumbers<-SubstrateBackgroundFrequency[,1]
 AllGeneNames<-names(Genelist)
 
 number_replaced<-0
 totalmotifs<-0
 for (z in 1:length(AAccessionNumbers)) {
+  #for every protein found in the substrate background frequency list
   pattern<-AAccessionNumbers[z]
   referencepoint<-grepl(pattern = pattern, x=AllGeneNames,fixed = TRUE)
-  #so take the accession number and find which matrix corresponds to that accession number
+  #find that protein in the GeneList list
   referencenumber<-which(referencepoint==TRUE)
   if (length(referencenumber)<1){referencenumber<-FALSE}
-  # if (referencenumber==FALSE)
-  #   ThisMatix<-array(data=NA, dim = c(21,9))
   if (referencenumber!=FALSE){
-
+    #then if you have actually found a match in the GeneList list, you will do the following
     motifs<-unlist(Genelist[[referencenumber]])
+    #take every tryptic peptide from that list
     therow<-c(1:15)
     for (a in 1:length(motifs)) {
       thecut<-unlist(strsplit(motifs[a], split=""))
       edges<-c("O","O","O","O","O","O","O")
       thecut<-c(edges,thecut,edges)
       theYs<-which(thecut=="Y")
+      #and use the above code to replace blank spaces in the -7 to +7 motif with Os, same as what I did for the positive substrates
       for (q in 1:length(theYs)) {
         thiscut<-thecut[(theYs[q]-7):(theYs[q]+7)]
         therow<-rbind(therow,thiscut)
         totalmotifs<-totalmotifs+1
+        #count how many motifs you've put in there
       }
     }
     
-    #I hate for loops but I'm doing them anyway
-    
     cutreplacement<-c("X","X","X","X","X","X","X","X","X","X","X","X","X","X","X")
     for (t in 1:nrow(therow)) {
       compare1<-therow[t,1:15]
@@ -476,18 +471,13 @@
         if (compare1==positivesubstrate){
           therow[t,1:15]<-cutreplacement
           number_replaced<-number_replaced+1
+          #if any of the substrates found in the substrate background frequency are identical to the positive substrates, this removes them
+          #and replaces them with Xs to ensure they do not get counted when all the AAs are being counted
           }
       }
       
     }
     
-    #remember here
-    #here's what I want to do: every motif gets archived individually as how many AAs are left and right
-    #of the Y, THEN I take SD and Mean of that?!?!?!
-    #... no.  I'M GOING TO SUM UP ALL THE INDIVIDUAL AAs AT EACH POSITION
-    #then divide them by total number of motifs
-    #then just divide percent table by that
-    #then find out if it's significant with a test
     Column1<-therow[,1]
     Column2<-therow[,2]
     Column3<-therow[,3]
@@ -503,11 +493,14 @@
     Column13<-therow[,13]
     Column14<-therow[,14]
     Column15<-therow[,15]
+    #split the motifs you found above into individual columns
+    #each column is ome of the positions between -7 and +7, and so each column represents every single amino acid found at that position
     slice1<-c(sum(Column1=="A"),sum(Column1=="C"),sum(Column1=="D"),sum(Column1=="E"),sum(Column1=="F"),
               sum(Column1=="G"),sum(Column1=="H"),sum(Column1=="I"),sum(Column1=="K"),sum(Column1=="L"),
               sum(Column1=="M"),sum(Column1=="N"),sum(Column1=="P"),sum(Column1=="Q"),sum(Column1=="R"),
               sum(Column1=="S"),sum(Column1=="T"),sum(Column1=="V"),sum(Column1=="W"),sum(Column1=="Y"),
               sum(Column1=="O"))
+    #count how many of each amino acid were present in that column, the vector of "how many of each amino acid were present in Column1", the name of that vector is slice1
     slice2<-c(sum(Column2=="A"),sum(Column2=="C"),sum(Column2=="D"),sum(Column2=="E"),sum(Column2=="F"),
               sum(Column2=="G"),sum(Column2=="H"),sum(Column2=="I"),sum(Column2=="K"),sum(Column2=="L"),
               sum(Column2=="M"),sum(Column2=="N"),sum(Column2=="P"),sum(Column2=="Q"),sum(Column2=="R"),
@@ -581,49 +574,40 @@
     ThisMatix<-cbind(slice1,slice2,slice3,slice4,slice5,slice6,slice7,slice8,slice9,
                      slice10,slice11,slice12,slice13,slice14,slice15)
     ThisMatix<-ThisMatix
+    #stick all those slices together in a matrix
     AllSubBackFreq[1:21,1:15,z]<-ThisMatix
+    #add that matrix to the 3d matrix of substratebackgroundfrequency
+    #the full matrix is a 3d matrix.  The Z dimension represents each Accession Number aka protein, the Y dimension represents 
+    #each of the 20 amino acids plus O which is AA number 21, and the X dimension represents the positions -7 to +7
   }
 }
 
 theletters<-c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","O")
-# 
-# AllSds<-apply(AllSubBackFreq, c(1,2), sd, na.rm = TRUE)
-# AllMeans<-apply(AllSubBackFreq, c(1,2), mean, na.rm = TRUE)
-#totalmotifs
-SumSBF<-apply(AllSubBackFreq, c(1,2), sum, na.rm=TRUE)
-# SumSBF<-SumSBF/totalmotifs
+#these are the 20 AAs of the code plus O which represents "no amino acid present here"
 
-##########
-#NumeratedPeptides<-sapply(LetteredPeptides, function(y) gsub("A",A,y,perl = TRUE))
-#ReferencePoints<-sapply(ReferencePoints,grepl, pattern = AAccessionNumbers, AllGeneNames,fixed = TRUE)
-#########
-#nrow(substrates)
-PercentTable<-rbind(AllAs,CllCs,DllDs,EllEs,FllFs,GllGs,HllHs,IllIs,KllKs,LllLs,MllMs,NllNs,PllPs,QllQs,RllRs,SllSs,TllTs,VllVs,WllWs,YllYs,OllOs)
-#PercentTable<-PercentTable*100
+SumSBF<-apply(AllSubBackFreq, c(1,2), sum, na.rm=TRUE)
+#the 3d matrix above is summed across its Z dimension
+
 
 fisheroddstable<-matrix(data = 1,nrow = 21,ncol = 15)
 fisherpvalstable<-matrix(data = 1,nrow = 21,ncol = 15)
 fisherpvalstableadjusted<-matrix(data = 1,nrow = 21,ncol = 15)
+#pre-allocate vectors
+
 for (rowas in 1:21) {
+  #for each of the 20 AAs plus O
   for (colams in 1:15) {
+    #for each position -7 to +7
     fishermatrix<-matrix(data=c(PercentTable[rowas,colams],nrow(substrates),SumSBF[rowas,colams],(totalmotifs-number_replaced)),nrow = 2)
     thetest<-fisher.test(x=fishermatrix)
+    #perform a Fisher Test to see if the frequency of that AA in the substrate background frequency is different than the frequency in the Positive Substrates
+    #remember that the frequency in the Positive Substrates is determined by the percent table
     fisheroddstable[rowas,colams]<-thetest$estimate
     fisherpvalstable[rowas,colams]<-thetest$p.value
     fisherpvalstableadjusted[rowas,colams]<-p.adjust(p=thetest$p.value,method = "fdr",n=21*15)
   }
 }
 
-# FisherPowerTable<-matrix(data = 1,nrow = 21,ncol = 9)
-# for (rowas in 1:21) {
-#   for (colams in 1:9) {
-#     pro1<-PercentTable[rowas,colams]/nrow(substrates)
-#     pro2<-SumSBF[rowas,colams]/totalmotifs
-#     PowerFisherTest<-power.fisher.test(pro1,pro2,nrow(substrates),totalmotifs)
-#     FisherPowerTable[rowas,colams]<-PowerFisherTest
-#   }
-# }
-
 fisheroddstable<-cbind.data.frame(theletters,fisheroddstable)
 fisherpvalstable<-cbind.data.frame(theletters,fisherpvalstable)
 fisherpvalstableadjusted<-cbind.data.frame(theletters,fisherpvalstableadjusted)
@@ -639,6 +623,7 @@
       theval<-testval
     }
     fisherupdown[x,y]<-theval
+    #determine which of the Fisher Odds found were actually significant
   }
 }
 
@@ -650,28 +635,17 @@
 write.table(x=fisherpvalstable,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
 write.table(x="Fisher p.values adjusted",file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
 write.table(x=fisherpvalstableadjusted,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
-# write.table(x="Fisher Power",file = SDtableAndPercentTable, append = TRUE,sep = ",")
-# write.table(x=FisherPowerTable,file = SDtableAndPercentTable, append = TRUE,sep = ",")
 
 SetOfAAs<-c("Letter","A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
-
-
 SetOfAAs<-matrix(data = SetOfAAs,ncol = 1)
-
 numberofY<-as.numeric(SubstrateBackgroundFrequency[,34])
 numberofY<-numberofY[!is.na(numberofY)]
-
 numberofPY<-as.numeric(SubstrateBackgroundFrequency[,35])
 numberofPY<-numberofPY[!is.na(numberofPY)]
-
 NormalizationScore<-sum(numberofPY)/sum(numberofY)
-
-
 NormalizationScore<-c("Normalization Score",NormalizationScore)
-
 write.table(NormalizationScore, file = SiteSelectivityTable_EndogenousProbabilityMatrix_NormalizationScore_CharacterizationTable, append = TRUE,sep = ",",row.names = FALSE, col.names = FALSE)
-
-######################################
+#output files
 
 
 bareSDs<-fisherupdown[1:20,2:16]
@@ -680,21 +654,10 @@
 bareSDs[3:4,4]<-1
 bareSDs[3:4,9]<-1
 bareSDs[3:4,13:14]<-1
+#take the Fisher Odds table but only that those odds, remove the labels that I put of them
 
-goodones<-bareSDs>1
 bareSDs[20,8]<-1
 
-allSDs<-fisheroddstable[1:20,2:16]
-allSDs[3:4,2]<-1
-allSDs[3:4,4]<-1
-allSDs[3:4,9]<-1
-allSDs[3:4,13:14]<-1
-
-#I'm trying to make it so it only goes 6 to 6 instead of 7 to 7, do this for speed reasons
-
-#what the above and below code does is this: fisherupdown is the "SD" table because it shows which positions and which amino acids the kinase likes and dislikes
-#so then I use the if and which statements below to automatically pick out WHICH amino acids the kinase likes at each position, if there are less than 2 there
-#I make sure there are at least 2.  And I make sure that D and E are always represented as possibilities for the purposes of the terbium binding test
 
 
 A=1
@@ -720,24 +683,18 @@
 
 aa_props <- c("A"=A, "C"=C, "D"=D, "E"=E, "F"=F,"G"=G,"H"=H,"I"=I,"K"=K,"L"=L,"M"=M,"N"=N,"P"=P,"Q"=Q,"R"=R,
               "S"=S,"T"=T,"V"=V,"W"=W,"Y"=Y,"xY"=Y,"O"=21)
+#this function converts letters to a number based on the equalities that I have written above
+#so aa_props(c(A,C,D)) will give c(1,2,3) as an answer
 
 ThisKinTable<-fisheroddstable
 
 NegativeScores<-rep(NA,times=nrow(NegativeSubstrateList))
 NegativeWeirdScores<-rep(NA,times=nrow(NegativeSubstrateList))
+
+#here I am going to take all of the Negative motifs and score them, according to the Fisher Odds Tables and the statistics of KINATEST-ID
 for (v in 1:nrow(NegativeSubstrateList)) {
   motif<-NegativeSubstrateList[v,2]
   motif<-unlist(strsplit(motif,""))
-  #if (length(motif)<9){print(v)}}
-  # motif[1] <- sapply(motif[1], function (x) aa_props[x])
-  # motif[2] <- sapply(motif[2], function (x) aa_props[x])
-  # motif[3] <- sapply(motif[3], function (x) aa_props[x])
-  # motif[4] <- sapply(motif[4], function (x) aa_props[x])
-  # motif[5] <- sapply(motif[5], function (x) aa_props[x])
-  # motif[6] <- sapply(motif[6], function (x) aa_props[x])
-  # motif[7] <- sapply(motif[7], function (x) aa_props[x])
-  # motif[8] <- sapply(motif[8], function (x) aa_props[x])
-  # motif[9] <- sapply(motif[9], function (x) aa_props[x])
   motif<- gsub(" ","O",motif)  
   motif <- sapply(motif, function (x) aa_props[x])
   Scoringpeptide<-motif
@@ -757,12 +714,10 @@
 negativesubstrates<-NegativeSubstrateList[,2]
 NegativeWithScores<-cbind(negativesubstrates,as.character(NegativeScores),as.character(NegativeWeirdScores))
 
-
-#NEED TO HAVE THE NEGATIVE SUBSTRATES BE OUTPUTTED
-
 PositiveScores<-rep(NA,times=nrow(PositiveSubstrateList))
 PositiveWeirdScores<-rep(NA,times=nrow(PositiveSubstrateList))
 
+#now I'm going to score all of the positive motifs, according to the same statistics
 for (v in 1:nrow(PositiveSubstrateList)) {
   motif<-PositiveSubstrateList[v,4:18]
   motif<-unlist(motif)
@@ -803,9 +758,7 @@
   }
   SumOfSigmaAAs[i]<-SumOfSigmasValue
 }
-
-
-#create the MCC table
+#this creates the Sum Of Sigmas value, also from the KINATEST-ID paper
 
 threshold<-c(1:100,(1:9)/10,(1:9)/100,0,-.1)
 threshold<-threshold[order(threshold,decreasing = TRUE)]
@@ -825,8 +778,9 @@
 F_One<-c(1:120)
 F_Two<-c(1:120)
 FalsePositiveRate<-c(1:120)
-#MAKE DAMN SURE THAT THE ACCESSION NUMBERS FOLLOW THE MOTIFS
 
+#here I am creating the Matthew's Correlation Coefficient table for each posible threshold value that could be used to separate negative motifs from positive motifs
+#these statistics allow us to determine the best threshold value
 for (z in 1:120) {
   thres<-threshold[z]
   Truepositives[z]<-length(PositiveWeirdScores[PositiveWeirdScores>=(thres)])
@@ -838,7 +792,6 @@
   One_Minus_Specificity[z]<-1-(TrueNegatives[z]/(FalsePositives[z]+TrueNegatives[z]))
   Accuracy[z]<-100*(Truepositives[z]+TrueNegatives[z])/(Falsenegatives[z]+FalsePositives[z]+TrueNegatives[z]+Truepositives[z])
   MCC[z]<-((Truepositives[z]*TrueNegatives[z])-(Falsenegatives[z]*FalsePositives[z]))/sqrt(round(round(Truepositives[z]+Falsenegatives[z])*round(TrueNegatives[z]+FalsePositives[z])*round(Truepositives[z]+FalsePositives[z])*round(TrueNegatives[z]+Falsenegatives[z])))
-  #EER[z]<-.01*(((1-(Sensitivity[z]))*(Truepositives[z]+Falsenegatives[z]))+(Specificity[z]*(1-(Truepositives[z]+Falsenegatives[z]))))
   EER[z]<-(FalsePositives[z]+Falsenegatives[z])/(Truepositives[z]+TrueNegatives[z]+FalsePositives[z]+Falsenegatives[z])
   Precision[z]<-Truepositives[z]/(Truepositives[z]+FalsePositives[z])
   F_One_Half[z]<-(1.5*Precision[z]*Sensitivity[z])/(.25*Precision[z]+Sensitivity[z])
@@ -854,15 +807,10 @@
 negativeheader<-c("Substrate","RPMS","PMS")
 colnames(NegativeWithScores)<-negativeheader
 
-# write.xlsx(NegativeWithScores,file = FILENAME, sheetName = "Negative Sequences Scored",col.names = TRUE,row.names = FALSE,append = TRUE)
-# write.xlsx(Characterization,file = FILENAME,sheetName = "Characterization Table",col.names = TRUE,row.names = FALSE,append = TRUE)
-# write.xlsx(RanksPeptides,file = FILENAME,sheetName = "Ranked Generated Peptides",col.names = FALSE,row.names = FALSE,append = TRUE)
-# write.xlsx(positivewithscores,file = FILENAME, sheetName = "Positive Sequences Scored",col.names = FALSE,row.names = FALSE,append = TRUE)
 write.table(x=c("Characterzation Table"),file = FILENAME2, col.names = FALSE,row.names = FALSE, append = TRUE,sep = ",")
 write.table(Characterization,file = FILENAME2, col.names = TRUE,row.names = FALSE, append = TRUE,sep = ",")
-
-
-#write.table(RanksPeptides,file = FILENAME3,append = TRUE,row.names = FALSE,col.names = TRUE,sep = ",")
+#write the output files
 
 
 options(warn = oldw)
+#reset the warning option
\ No newline at end of file