view KT-ID fisher test/7-7-fisher-galaxy_working.R @ 7:77f3a77f0ca2 draft default tip

Uploaded
author jfb
date Tue, 14 Jul 2020 19:57:11 -0400
parents eb89ff215fb8
children
line wrap: on
line source

oldw <- getOption("warn")
options(warn = -1)
#the Galaxy tool will error out if the system displays a warning, even if that warning does not cause the code to fail
#in order to ensure that the tool does not error out due to a simple Warning, I turn down the warning option to ignore warnings

PositiveSubstrateList<- read.csv("substrates.csv", stringsAsFactors=FALSE)
NegativeSubstrateList<- read.csv("negatives.csv", stringsAsFactors=FALSE)
SubstrateBackgroundFrequency<- read.csv("SBF.csv", stringsAsFactors=FALSE,header = FALSE)
SubstrateBackgroundFrequency<-t(SubstrateBackgroundFrequency)
SubstrateBackgroundFrequency<-SubstrateBackgroundFrequency[2:nrow(SubstrateBackgroundFrequency),]
#this takes in all the inputs and ensures they are oriented correctly

ScreenerFilename<-"screener"
screaner<-read.csv(ScreenerFilename, header = FALSE, stringsAsFactors = FALSE)
#the screener file is packaged with the tool and is used for actually not for anything

DataFilename<-"thedata.RData"
load(DataFilename)
#this is a data file containing a list called Genelist
#genelist is a list of every single Uniprot-Swissprot protein in the human genome
#and those proteins are then *in-silico* digested by trypsin, and any Y-containing tryptic peptides go in the list called GeneList
#so GeneList is a list of all Y-containing tryptic peptides from the human proteome

SDtableAndPercentTable<-"output1.csv"
NormalizationScore_CharacterizationTable<-"output2.csv"
SequenceScoringAndScreening<-"output3.csv"
#these are the names of the output files that will be generated

substrates<-matrix(rep("A",times=((nrow(PositiveSubstrateList)-1)*15)),ncol = 15)
#an "empty" matrix will gets filled below.  This matrix will contain all the letters of the input Positive Substrates

for (i in 2:nrow(PositiveSubstrateList))
{
  substratemotif<-PositiveSubstrateList[i,4:18]
  substratemotif[8]<-"Y"
  #substratemotif<-paste(substratemotif,sep = "",collapse = "")
  j=i-1
  substratemotif<-unlist(substratemotif)
  substrates[j,1:15]<-substratemotif
}

substrates2<-substrates
substrates2[substrates2==""]<-"O"
#any blank spots on the substrates list are places where there was no Amino Acid, so I give them the letter O.


#I create the percent table here, which finds what percertage of each amino acid was present at each position in the substrates, from -7 to +7
if (1==1){
Column1<-substrates[,1]
Column2<-substrates[,2]
Column3<-substrates[,3]
Column4<-substrates[,4]
Column5<-substrates[,5]
Column6<-substrates[,6]
Column7<-substrates[,7]
Column8<-substrates[,8]
Column9<-substrates[,9]
Column10<-substrates[,10]
Column11<-substrates[,11]
Column12<-substrates[,12]
Column13<-substrates[,13]
Column14<-substrates[,14]
Column15<-substrates[,15]

spaces1<-sum(Column1%in% "")
spaces2<-sum(Column2%in% "")
spaces3<-sum(Column3%in% "")
spaces4<-sum(Column4%in% "")
spaces5<-sum(Column5%in% "")
spaces6<-sum(Column6%in% "")
spaces7<-sum(Column7%in% "")
spaces8<-sum(Column8%in% "")
spaces9<-sum(Column9%in% "")
spaces10<-sum(Column10%in% "")
spaces11<-sum(Column11%in% "")
spaces12<-sum(Column12%in% "")
spaces13<-sum(Column13%in% "")
spaces14<-sum(Column14%in% "")
spaces15<-sum(Column15%in% "")
OllOs<-cbind(spaces1,spaces2,spaces3,spaces4,spaces5,spaces6,spaces7,spaces8,spaces9,spaces10,spaces11,
             spaces12,spaces13,spaces14,spaces15)

A1<-sum(Column1 %in% "A")
A2<-sum(Column2 %in% "A")
A3<-sum(Column3 %in% "A")
A4<-sum(Column4 %in% "A")
A5<-sum(Column5 %in% "A")
A6<-sum(Column6 %in% "A")
A7<-sum(Column7 %in% "A")
A8<-sum(Column8 %in% "A")
A9<-sum(Column9 %in% "A")
A10<-sum(Column10 %in% "A")
A11<-sum(Column11 %in% "A")
A12<-sum(Column12 %in% "A")
A13<-sum(Column13 %in% "A")
A14<-sum(Column14 %in% "A")
A15<-sum(Column15 %in% "A")
AllAs<-cbind(A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15)

C1<-sum(Column1 %in% "C")
C2<-sum(Column2 %in% "C")
C3<-sum(Column3 %in% "C")
C4<-sum(Column4 %in% "C")
C5<-sum(Column5 %in% "C")
C6<-sum(Column6 %in% "C")
C7<-sum(Column7 %in% "C")
C8<-sum(Column8 %in% "C")
C9<-sum(Column9 %in% "C")
C10<-sum(Column10 %in% "C")
C11<-sum(Column11 %in% "C")
C12<-sum(Column12 %in% "C")
C13<-sum(Column13 %in% "C")
C14<-sum(Column14 %in% "C")
C15<-sum(Column15 %in% "C")
CllCs<-cbind(C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15)

D1<-sum(Column1 %in% "D")
D2<-sum(Column2 %in% "D")
D3<-sum(Column3 %in% "D")
D4<-sum(Column4 %in% "D")
D5<-sum(Column5 %in% "D")
D6<-sum(Column6 %in% "D")
D7<-sum(Column7 %in% "D")
D8<-sum(Column8 %in% "D")
D9<-sum(Column9 %in% "D")
D10<-sum(Column10 %in% "D")
D11<-sum(Column11 %in% "D")
D12<-sum(Column12 %in% "D")
D13<-sum(Column13 %in% "D")
D14<-sum(Column14 %in% "D")
D15<-sum(Column15 %in% "D")
DllDs<-cbind(D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15)

E1<-sum(Column1 %in% "E")
E2<-sum(Column2 %in% "E")
E3<-sum(Column3 %in% "E")
E4<-sum(Column4 %in% "E")
E5<-sum(Column5 %in% "E")
E6<-sum(Column6 %in% "E")
E7<-sum(Column7 %in% "E")
E8<-sum(Column8 %in% "E")
E9<-sum(Column9 %in% "E")
E10<-sum(Column10 %in% "E")
E11<-sum(Column11 %in% "E")
E12<-sum(Column12 %in% "E")
E13<-sum(Column13 %in% "E")
E14<-sum(Column14 %in% "E")
E15<-sum(Column15 %in% "E")
EllEs<-cbind(E1,E2,E3,E4,E5,E6,E7,E8,E9,E10,E11,E12,E13,E14,E15)

F1<-sum(Column1 %in% "F")
F2<-sum(Column2 %in% "F")
F3<-sum(Column3 %in% "F")
F4<-sum(Column4 %in% "F")
F5<-sum(Column5 %in% "F")
F6<-sum(Column6 %in% "F")
F7<-sum(Column7 %in% "F")
F8<-sum(Column8 %in% "F")
F9<-sum(Column9 %in% "F")
F10<-sum(Column10 %in% "F")
F11<-sum(Column11 %in% "F")
F12<-sum(Column12 %in% "F")
F13<-sum(Column13 %in% "F")
F14<-sum(Column14 %in% "F")
F15<-sum(Column15 %in% "F")
FllFs<-cbind(F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15)

G1<-sum(Column1 %in% "G")
G2<-sum(Column2 %in% "G")
G3<-sum(Column3 %in% "G")
G4<-sum(Column4 %in% "G")
G5<-sum(Column5 %in% "G")
G6<-sum(Column6 %in% "G")
G7<-sum(Column7 %in% "G")
G8<-sum(Column8 %in% "G")
G9<-sum(Column9 %in% "G")
G10<-sum(Column10 %in% "G")
G11<-sum(Column11 %in% "G")
G12<-sum(Column12 %in% "G")
G13<-sum(Column13 %in% "G")
G14<-sum(Column14 %in% "G")
G15<-sum(Column15 %in% "G")
GllGs<-cbind(G1,G2,G3,G4,G5,G6,G7,G8,G9,G10,G11,G12,G13,G14,G15)

H1<-sum(Column1 %in% "H")
H2<-sum(Column2 %in% "H")
H3<-sum(Column3 %in% "H")
H4<-sum(Column4 %in% "H")
H5<-sum(Column5 %in% "H")
H6<-sum(Column6 %in% "H")
H7<-sum(Column7 %in% "H")
H8<-sum(Column8 %in% "H")
H9<-sum(Column9 %in% "H")
H10<-sum(Column10 %in% "H")
H11<-sum(Column11 %in% "H")
H12<-sum(Column12 %in% "H")
H13<-sum(Column13 %in% "H")
H14<-sum(Column14 %in% "H")
H15<-sum(Column15 %in% "H")
HllHs<-cbind(H1,H2,H3,H4,H5,H6,H7,H8,H9,H10,H11,H12,H13,H14,H15)

I1<-sum(Column1 %in% "I")
I2<-sum(Column2 %in% "I")
I3<-sum(Column3 %in% "I")
I4<-sum(Column4 %in% "I")
I5<-sum(Column5 %in% "I")
I6<-sum(Column6 %in% "I")
I7<-sum(Column7 %in% "I")
I8<-sum(Column8 %in% "I")
I9<-sum(Column9 %in% "I")
I10<-sum(Column10 %in% "I")
I11<-sum(Column11 %in% "I")
I12<-sum(Column12 %in% "I")
I13<-sum(Column13 %in% "I")
I14<-sum(Column14 %in% "I")
I15<-sum(Column15 %in% "I")
IllIs<-cbind(I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,I14,I15)

K1<-sum(Column1 %in% "K")
K2<-sum(Column2 %in% "K")
K3<-sum(Column3 %in% "K")
K4<-sum(Column4 %in% "K")
K5<-sum(Column5 %in% "K")
K6<-sum(Column6 %in% "K")
K7<-sum(Column7 %in% "K")
K8<-sum(Column8 %in% "K")
K9<-sum(Column9 %in% "K")
K10<-sum(Column10 %in% "K")
K11<-sum(Column11 %in% "K")
K12<-sum(Column12 %in% "K")
K13<-sum(Column13 %in% "K")
K14<-sum(Column14 %in% "K")
K15<-sum(Column15 %in% "K")
KllKs<-cbind(K1,K2,K3,K4,K5,K6,K7,K8,K9,K10,K11,K12,K13,K14,K15)

L1<-sum(Column1 %in% "L")
L2<-sum(Column2 %in% "L")
L3<-sum(Column3 %in% "L")
L4<-sum(Column4 %in% "L")
L5<-sum(Column5 %in% "L")
L6<-sum(Column6 %in% "L")
L7<-sum(Column7 %in% "L")
L8<-sum(Column8 %in% "L")
L9<-sum(Column9 %in% "L")
L10<-sum(Column10 %in% "L")
L11<-sum(Column11 %in% "L")
L12<-sum(Column12 %in% "L")
L13<-sum(Column13 %in% "L")
L14<-sum(Column14 %in% "L")
L15<-sum(Column15 %in% "L")
LllLs<-cbind(L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15)

M1<-sum(Column1 %in% "M")
M2<-sum(Column2 %in% "M")
M3<-sum(Column3 %in% "M")
M4<-sum(Column4 %in% "M")
M5<-sum(Column5 %in% "M")
M6<-sum(Column6 %in% "M")
M7<-sum(Column7 %in% "M")
M8<-sum(Column8 %in% "M")
M9<-sum(Column9 %in% "M")
M10<-sum(Column10 %in% "M")
M11<-sum(Column11 %in% "M")
M12<-sum(Column12 %in% "M")
M13<-sum(Column13 %in% "M")
M14<-sum(Column14 %in% "M")
M15<-sum(Column15 %in% "M")
MllMs<-cbind(M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15)

N1<-sum(Column1 %in% "N")
N2<-sum(Column2 %in% "N")
N3<-sum(Column3 %in% "N")
N4<-sum(Column4 %in% "N")
N5<-sum(Column5 %in% "N")
N6<-sum(Column6 %in% "N")
N7<-sum(Column7 %in% "N")
N8<-sum(Column8 %in% "N")
N9<-sum(Column9 %in% "N")
N10<-sum(Column10 %in% "N")
N11<-sum(Column11 %in% "N")
N12<-sum(Column12 %in% "N")
N13<-sum(Column13 %in% "N")
N14<-sum(Column14 %in% "N")
N15<-sum(Column15 %in% "N")
NllNs<-cbind(N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15)

P1<-sum(Column1 %in% "P")
P2<-sum(Column2 %in% "P")
P3<-sum(Column3 %in% "P")
P4<-sum(Column4 %in% "P")
P5<-sum(Column5 %in% "P")
P6<-sum(Column6 %in% "P")
P7<-sum(Column7 %in% "P")
P8<-sum(Column8 %in% "P")
P9<-sum(Column9 %in% "P")
P10<-sum(Column10 %in% "P")
P11<-sum(Column11 %in% "P")
P12<-sum(Column12 %in% "P")
P13<-sum(Column13 %in% "P")
P14<-sum(Column14 %in% "P")
P15<-sum(Column15 %in% "P")
PllPs<-cbind(P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,P15)

Q1<-sum(Column1 %in% "Q")
Q2<-sum(Column2 %in% "Q")
Q3<-sum(Column3 %in% "Q")
Q4<-sum(Column4 %in% "Q")
Q5<-sum(Column5 %in% "Q")
Q6<-sum(Column6 %in% "Q")
Q7<-sum(Column7 %in% "Q")
Q8<-sum(Column8 %in% "Q")
Q9<-sum(Column9 %in% "Q")
Q10<-sum(Column10 %in% "Q")
Q11<-sum(Column11 %in% "Q")
Q12<-sum(Column12 %in% "Q")
Q13<-sum(Column13 %in% "Q")
Q14<-sum(Column14 %in% "Q")
Q15<-sum(Column15 %in% "Q")
QllQs<-cbind(Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Q13,Q14,Q15)

R1<-sum(Column1 %in% "R")
R2<-sum(Column2 %in% "R")
R3<-sum(Column3 %in% "R")
R4<-sum(Column4 %in% "R")
R5<-sum(Column5 %in% "R")
R6<-sum(Column6 %in% "R")
R7<-sum(Column7 %in% "R")
R8<-sum(Column8 %in% "R")
R9<-sum(Column9 %in% "R")
R10<-sum(Column10 %in% "R")
R11<-sum(Column11 %in% "R")
R12<-sum(Column12 %in% "R")
R13<-sum(Column13 %in% "R")
R14<-sum(Column14 %in% "R")
R15<-sum(Column15 %in% "R")
RllRs<-cbind(R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,R13,R14,R15)

S1<-sum(Column1 %in% "S")
S2<-sum(Column2 %in% "S")
S3<-sum(Column3 %in% "S")
S4<-sum(Column4 %in% "S")
S5<-sum(Column5 %in% "S")
S6<-sum(Column6 %in% "S")
S7<-sum(Column7 %in% "S")
S8<-sum(Column8 %in% "S")
S9<-sum(Column9 %in% "S")
S10<-sum(Column10 %in% "S")
S11<-sum(Column11 %in% "S")
S12<-sum(Column12 %in% "S")
S13<-sum(Column13 %in% "S")
S14<-sum(Column14 %in% "S")
S15<-sum(Column15 %in% "S")
SllSs<-cbind(S1,S2,S3,S4,S5,S6,S7,S8,S9,S10,S11,S12,S13,S14,S15)

T1<-sum(Column1 %in% "T")
T2<-sum(Column2 %in% "T")
T3<-sum(Column3 %in% "T")
T4<-sum(Column4 %in% "T")
T5<-sum(Column5 %in% "T")
T6<-sum(Column6 %in% "T")
T7<-sum(Column7 %in% "T")
T8<-sum(Column8 %in% "T")
T9<-sum(Column9 %in% "T")
T10<-sum(Column10 %in% "T")
T11<-sum(Column11 %in% "T")
T12<-sum(Column12 %in% "T")
T13<-sum(Column13 %in% "T")
T14<-sum(Column14 %in% "T")
T15<-sum(Column15 %in% "T")
TllTs<-cbind(T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15)

V1<-sum(Column1 %in% "V")
V2<-sum(Column2 %in% "V")
V3<-sum(Column3 %in% "V")
V4<-sum(Column4 %in% "V")
V5<-sum(Column5 %in% "V")
V6<-sum(Column6 %in% "V")
V7<-sum(Column7 %in% "V")
V8<-sum(Column8 %in% "V")
V9<-sum(Column9 %in% "V")
V10<-sum(Column10 %in% "V")
V11<-sum(Column11 %in% "V")
V12<-sum(Column12 %in% "V")
V13<-sum(Column13 %in% "V")
V14<-sum(Column14 %in% "V")
V15<-sum(Column15 %in% "V")
VllVs<-cbind(V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15)

W1<-sum(Column1 %in% "W")
W2<-sum(Column2 %in% "W")
W3<-sum(Column3 %in% "W")
W4<-sum(Column4 %in% "W")
W5<-sum(Column5 %in% "W")
W6<-sum(Column6 %in% "W")
W7<-sum(Column7 %in% "W")
W8<-sum(Column8 %in% "W")
W9<-sum(Column9 %in% "W")
W10<-sum(Column10 %in% "W")
W11<-sum(Column11 %in% "W")
W12<-sum(Column12 %in% "W")
W13<-sum(Column13 %in% "W")
W14<-sum(Column14 %in% "W")
W15<-sum(Column15 %in% "W")
WllWs<-cbind(W1,W2,W3,W4,W5,W6,W7,W8,W9,W10,W11,W12,W13,W14,W15)

Y1<-sum(Column1 %in% "Y")
Y2<-sum(Column2 %in% "Y")
Y3<-sum(Column3 %in% "Y")
Y4<-sum(Column4 %in% "Y")
Y5<-sum(Column5 %in% "Y")
Y6<-sum(Column6 %in% "Y")
Y7<-sum(Column7 %in% "Y")
Y8<-sum(Column8 %in% "Y")
Y9<-sum(Column9 %in% "Y")
Y10<-sum(Column10 %in% "Y")
Y11<-sum(Column11 %in% "Y")
Y12<-sum(Column12 %in% "Y")
Y13<-sum(Column13 %in% "Y")
Y14<-sum(Column14 %in% "Y")
Y15<-sum(Column15 %in% "Y")
YllYs<-cbind(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13,Y14,Y15)

PercentTable<-rbind(AllAs,CllCs,DllDs,EllEs,FllFs,GllGs,HllHs,IllIs,KllKs,LllLs,MllMs,NllNs,PllPs,QllQs,RllRs,SllSs,TllTs,VllVs,WllWs,YllYs,OllOs)
}

AllSubBackFreq<-array(data = NA,dim = c(21,15,nrow(SubstrateBackgroundFrequency)))
#this is where I'm creating the new SubBackFreq table, I have a list with all possible SBF matrices,
#I perform a for function to find all the matrices that are in Substrate Background Frequency
#and place them all in this array, then I will do mean and SD

AAccessionNumbers<-SubstrateBackgroundFrequency[,1]
AllGeneNames<-names(Genelist)

number_replaced<-0
totalmotifs<-0
for (z in 1:length(AAccessionNumbers)) {
  #for every protein found in the substrate background frequency list
  pattern<-AAccessionNumbers[z]
  referencepoint<-grepl(pattern = pattern, x=AllGeneNames,fixed = TRUE)
  #find that protein in the GeneList list
  referencenumber<-which(referencepoint==TRUE)
  if (length(referencenumber)<1){referencenumber<-FALSE}
  if (referencenumber!=FALSE){
    #then if you have actually found a match in the GeneList list, you will do the following
    motifs<-unlist(Genelist[[referencenumber]])
    #take every tryptic peptide from that list
    therow<-c(1:15)
    for (a in 1:length(motifs)) {
      thecut<-unlist(strsplit(motifs[a], split=""))
      edges<-c("O","O","O","O","O","O","O")
      thecut<-c(edges,thecut,edges)
      theYs<-which(thecut=="Y")
      #and use the above code to replace blank spaces in the -7 to +7 motif with Os, same as what I did for the positive substrates
      for (q in 1:length(theYs)) {
        thiscut<-thecut[(theYs[q]-7):(theYs[q]+7)]
        therow<-rbind(therow,thiscut)
        totalmotifs<-totalmotifs+1
        #count how many motifs you've put in there
      }
    }
    
    cutreplacement<-c("X","X","X","X","X","X","X","X","X","X","X","X","X","X","X")
    for (t in 1:nrow(therow)) {
      compare1<-therow[t,1:15]
      compare1<-paste(compare1,sep = "",collapse = "")
      
      for (v in 1:nrow(substrates2)) {
        positivesubstrate<-substrates2[v,1:15]
        positivesubstrate<-paste(positivesubstrate,sep = "",collapse = "")
        
        if (compare1==positivesubstrate){
          therow[t,1:15]<-cutreplacement
          number_replaced<-number_replaced+1
          #if any of the substrates found in the substrate background frequency are identical to the positive substrates, this removes them
          #and replaces them with Xs to ensure they do not get counted when all the AAs are being counted
          }
      }
      
    }
    
    Column1<-therow[,1]
    Column2<-therow[,2]
    Column3<-therow[,3]
    Column4<-therow[,4]
    Column5<-therow[,5]
    Column6<-therow[,6]
    Column7<-therow[,7]
    Column8<-therow[,8]
    Column9<-therow[,9]
    Column10<-therow[,10]
    Column11<-therow[,11]
    Column12<-therow[,12]
    Column13<-therow[,13]
    Column14<-therow[,14]
    Column15<-therow[,15]
    #split the motifs you found above into individual columns
    #each column is ome of the positions between -7 and +7, and so each column represents every single amino acid found at that position
    slice1<-c(sum(Column1=="A"),sum(Column1=="C"),sum(Column1=="D"),sum(Column1=="E"),sum(Column1=="F"),
              sum(Column1=="G"),sum(Column1=="H"),sum(Column1=="I"),sum(Column1=="K"),sum(Column1=="L"),
              sum(Column1=="M"),sum(Column1=="N"),sum(Column1=="P"),sum(Column1=="Q"),sum(Column1=="R"),
              sum(Column1=="S"),sum(Column1=="T"),sum(Column1=="V"),sum(Column1=="W"),sum(Column1=="Y"),
              sum(Column1=="O"))
    #count how many of each amino acid were present in that column, the vector of "how many of each amino acid were present in Column1", the name of that vector is slice1
    slice2<-c(sum(Column2=="A"),sum(Column2=="C"),sum(Column2=="D"),sum(Column2=="E"),sum(Column2=="F"),
              sum(Column2=="G"),sum(Column2=="H"),sum(Column2=="I"),sum(Column2=="K"),sum(Column2=="L"),
              sum(Column2=="M"),sum(Column2=="N"),sum(Column2=="P"),sum(Column2=="Q"),sum(Column2=="R"),
              sum(Column2=="S"),sum(Column2=="T"),sum(Column2=="V"),sum(Column2=="W"),sum(Column2=="Y"),
              sum(Column2=="O"))
    slice3<-c(sum(Column3=="A"),sum(Column3=="C"),sum(Column3=="D"),sum(Column3=="E"),sum(Column3=="F"),
              sum(Column3=="G"),sum(Column3=="H"),sum(Column3=="I"),sum(Column3=="K"),sum(Column3=="L"),
              sum(Column3=="M"),sum(Column3=="N"),sum(Column3=="P"),sum(Column3=="Q"),sum(Column3=="R"),
              sum(Column3=="S"),sum(Column3=="T"),sum(Column3=="V"),sum(Column3=="W"),sum(Column3=="Y"),
              sum(Column3=="O"))
    slice4<-c(sum(Column4=="A"),sum(Column4=="C"),sum(Column4=="D"),sum(Column4=="E"),sum(Column4=="F"),
              sum(Column4=="G"),sum(Column4=="H"),sum(Column4=="I"),sum(Column4=="K"),sum(Column4=="L"),
              sum(Column4=="M"),sum(Column4=="N"),sum(Column4=="P"),sum(Column4=="Q"),sum(Column4=="R"),
              sum(Column4=="S"),sum(Column4=="T"),sum(Column4=="V"),sum(Column4=="W"),sum(Column4=="Y"),
              sum(Column4=="O"))
    slice5<-c(sum(Column5=="A"),sum(Column5=="C"),sum(Column5=="D"),sum(Column5=="E"),sum(Column5=="F"),
              sum(Column5=="G"),sum(Column5=="H"),sum(Column5=="I"),sum(Column5=="K"),sum(Column5=="L"),
              sum(Column5=="M"),sum(Column5=="N"),sum(Column5=="P"),sum(Column5=="Q"),sum(Column5=="R"),
              sum(Column5=="S"),sum(Column5=="T"),sum(Column5=="V"),sum(Column5=="W"),sum(Column5=="Y"),
              sum(Column5=="O"))
    slice6<-c(sum(Column6=="A"),sum(Column6=="C"),sum(Column6=="D"),sum(Column6=="E"),sum(Column6=="F"),
              sum(Column6=="G"),sum(Column6=="H"),sum(Column6=="I"),sum(Column6=="K"),sum(Column6=="L"),
              sum(Column6=="M"),sum(Column6=="N"),sum(Column6=="P"),sum(Column6=="Q"),sum(Column6=="R"),
              sum(Column6=="S"),sum(Column6=="T"),sum(Column6=="V"),sum(Column6=="W"),sum(Column6=="Y"),
              sum(Column6=="O"))
    slice7<-c(sum(Column7=="A"),sum(Column7=="C"),sum(Column7=="D"),sum(Column7=="E"),sum(Column7=="F"),
              sum(Column7=="G"),sum(Column7=="H"),sum(Column7=="I"),sum(Column7=="K"),sum(Column7=="L"),
              sum(Column7=="M"),sum(Column7=="N"),sum(Column7=="P"),sum(Column7=="Q"),sum(Column7=="R"),
              sum(Column7=="S"),sum(Column7=="T"),sum(Column7=="V"),sum(Column7=="W"),sum(Column7=="Y"),
              sum(Column7=="O"))
    slice8<-c(sum(Column8=="A"),sum(Column8=="C"),sum(Column8=="D"),sum(Column8=="E"),sum(Column8=="F"),
              sum(Column8=="G"),sum(Column8=="H"),sum(Column8=="I"),sum(Column8=="K"),sum(Column8=="L"),
              sum(Column8=="M"),sum(Column8=="N"),sum(Column8=="P"),sum(Column8=="Q"),sum(Column8=="R"),
              sum(Column8=="S"),sum(Column8=="T"),sum(Column8=="V"),sum(Column8=="W"),sum(Column8=="Y"),
              sum(Column8=="O"))
    slice9<-c(sum(Column9=="A"),sum(Column9=="C"),sum(Column9=="D"),sum(Column9=="E"),sum(Column9=="F"),
              sum(Column9=="G"),sum(Column9=="H"),sum(Column9=="I"),sum(Column9=="K"),sum(Column9=="L"),
              sum(Column9=="M"),sum(Column9=="N"),sum(Column9=="P"),sum(Column9=="Q"),sum(Column9=="R"),
              sum(Column9=="S"),sum(Column9=="T"),sum(Column9=="V"),sum(Column9=="W"),sum(Column9=="Y"),
              sum(Column9=="O"))
    slice10<-c(sum(Column10=="A"),sum(Column10=="C"),sum(Column10=="D"),sum(Column10=="E"),sum(Column10=="F"),
              sum(Column10=="G"),sum(Column10=="H"),sum(Column10=="I"),sum(Column10=="K"),sum(Column10=="L"),
              sum(Column10=="M"),sum(Column10=="N"),sum(Column10=="P"),sum(Column10=="Q"),sum(Column10=="R"),
              sum(Column10=="S"),sum(Column10=="T"),sum(Column10=="V"),sum(Column10=="W"),sum(Column10=="Y"),
              sum(Column10=="O"))
    slice11<-c(sum(Column11=="A"),sum(Column11=="C"),sum(Column11=="D"),sum(Column11=="E"),sum(Column11=="F"),
              sum(Column11=="G"),sum(Column11=="H"),sum(Column11=="I"),sum(Column11=="K"),sum(Column11=="L"),
              sum(Column11=="M"),sum(Column11=="N"),sum(Column11=="P"),sum(Column11=="Q"),sum(Column11=="R"),
              sum(Column11=="S"),sum(Column11=="T"),sum(Column11=="V"),sum(Column11=="W"),sum(Column11=="Y"),
              sum(Column11=="O"))
    slice12<-c(sum(Column12=="A"),sum(Column12=="C"),sum(Column12=="D"),sum(Column12=="E"),sum(Column12=="F"),
              sum(Column12=="G"),sum(Column12=="H"),sum(Column12=="I"),sum(Column12=="K"),sum(Column12=="L"),
              sum(Column12=="M"),sum(Column12=="N"),sum(Column12=="P"),sum(Column12=="Q"),sum(Column12=="R"),
              sum(Column12=="S"),sum(Column12=="T"),sum(Column12=="V"),sum(Column12=="W"),sum(Column12=="Y"),
              sum(Column12=="O"))
    slice13<-c(sum(Column13=="A"),sum(Column13=="C"),sum(Column13=="D"),sum(Column13=="E"),sum(Column13=="F"),
              sum(Column13=="G"),sum(Column13=="H"),sum(Column13=="I"),sum(Column13=="K"),sum(Column13=="L"),
              sum(Column13=="M"),sum(Column13=="N"),sum(Column13=="P"),sum(Column13=="Q"),sum(Column13=="R"),
              sum(Column13=="S"),sum(Column13=="T"),sum(Column13=="V"),sum(Column13=="W"),sum(Column13=="Y"),
              sum(Column13=="O"))
    slice14<-c(sum(Column14=="A"),sum(Column14=="C"),sum(Column14=="D"),sum(Column14=="E"),sum(Column14=="F"),
              sum(Column14=="G"),sum(Column14=="H"),sum(Column14=="I"),sum(Column14=="K"),sum(Column14=="L"),
              sum(Column14=="M"),sum(Column14=="N"),sum(Column14=="P"),sum(Column14=="Q"),sum(Column14=="R"),
              sum(Column14=="S"),sum(Column14=="T"),sum(Column14=="V"),sum(Column14=="W"),sum(Column14=="Y"),
              sum(Column14=="O"))
    slice15<-c(sum(Column15=="A"),sum(Column15=="C"),sum(Column15=="D"),sum(Column15=="E"),sum(Column15=="F"),
              sum(Column15=="G"),sum(Column15=="H"),sum(Column15=="I"),sum(Column15=="K"),sum(Column15=="L"),
              sum(Column15=="M"),sum(Column15=="N"),sum(Column15=="P"),sum(Column15=="Q"),sum(Column15=="R"),
              sum(Column15=="S"),sum(Column15=="T"),sum(Column15=="V"),sum(Column15=="W"),sum(Column15=="Y"),
              sum(Column15=="O"))
    ThisMatix<-cbind(slice1,slice2,slice3,slice4,slice5,slice6,slice7,slice8,slice9,
                     slice10,slice11,slice12,slice13,slice14,slice15)
    ThisMatix<-ThisMatix
    #stick all those slices together in a matrix
    AllSubBackFreq[1:21,1:15,z]<-ThisMatix
    #add that matrix to the 3d matrix of substratebackgroundfrequency
    #the full matrix is a 3d matrix.  The Z dimension represents each Accession Number aka protein, the Y dimension represents 
    #each of the 20 amino acids plus O which is AA number 21, and the X dimension represents the positions -7 to +7
  }
}

theletters<-c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","O")
#these are the 20 AAs of the code plus O which represents "no amino acid present here"

SumSBF<-apply(AllSubBackFreq, c(1,2), sum, na.rm=TRUE)
#the 3d matrix above is summed across its Z dimension


fisheroddstable<-matrix(data = 1,nrow = 21,ncol = 15)
fisherpvalstable<-matrix(data = 1,nrow = 21,ncol = 15)
fisherpvalstableadjusted<-matrix(data = 1,nrow = 21,ncol = 15)
#pre-allocate vectors

for (rowas in 1:21) {
  #for each of the 20 AAs plus O
  for (colams in 1:15) {
    #for each position -7 to +7
    fishermatrix<-matrix(data=c(PercentTable[rowas,colams],nrow(substrates),SumSBF[rowas,colams],(totalmotifs-number_replaced)),nrow = 2)
    thetest<-fisher.test(x=fishermatrix)
    #perform a Fisher Test to see if the frequency of that AA in the substrate background frequency is different than the frequency in the Positive Substrates
    #remember that the frequency in the Positive Substrates is determined by the percent table
    fisheroddstable[rowas,colams]<-thetest$estimate
    fisherpvalstable[rowas,colams]<-thetest$p.value
    fisherpvalstableadjusted[rowas,colams]<-p.adjust(p=thetest$p.value,method = "fdr",n=21*15)
  }
}

fisheroddstable<-cbind.data.frame(theletters,fisheroddstable)
fisherpvalstable<-cbind.data.frame(theletters,fisherpvalstable)
fisherpvalstableadjusted<-cbind.data.frame(theletters,fisherpvalstableadjusted)

fisherupdown<-fisheroddstable

for (x in 1:21) {
  for (y in 2:16) {
    theval<-1
    testval<-fisheroddstable[x,y]
    testp<-fisherpvalstable[x,y]
    if (testp<.05){
      theval<-testval
    }
    fisherupdown[x,y]<-theval
    #determine which of the Fisher Odds found were actually significant
  }
}

write.table(x="Fisher Odds, only significant ones",file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x=fisherupdown,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x="Fisher Odds",file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x=fisheroddstable,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x="Fisher p.values",file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x=fisherpvalstable,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x="Fisher p.values adjusted",file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)
write.table(x=fisherpvalstableadjusted,file = SDtableAndPercentTable, append = TRUE,sep = ",",col.names = FALSE,row.names = FALSE)

SetOfAAs<-c("Letter","A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
SetOfAAs<-matrix(data = SetOfAAs,ncol = 1)
numberofY<-as.numeric(SubstrateBackgroundFrequency[,34])
numberofY<-numberofY[!is.na(numberofY)]
numberofPY<-as.numeric(SubstrateBackgroundFrequency[,35])
numberofPY<-numberofPY[!is.na(numberofPY)]
NormalizationScore<-sum(numberofPY)/sum(numberofY)
NormalizationScore<-c("Normalization Score",NormalizationScore)
write.table(NormalizationScore, file = SiteSelectivityTable_EndogenousProbabilityMatrix_NormalizationScore_CharacterizationTable, append = TRUE,sep = ",",row.names = FALSE, col.names = FALSE)
#output files


bareSDs<-fisherupdown[1:20,2:16]
bareSDs[20,8]<-3
bareSDs[3:4,2]<-1
bareSDs[3:4,4]<-1
bareSDs[3:4,9]<-1
bareSDs[3:4,13:14]<-1
#take the Fisher Odds table but only that those odds, remove the labels that I put of them

bareSDs[20,8]<-1



A=1
C=2
D=3
E=4
F=5
G=6
H=7
I=8
K=9
L=10
M=11
N=12
P=13
Q=14
R=15
S=16
T=17
V=18
W=19
Y=20

aa_props <- c("A"=A, "C"=C, "D"=D, "E"=E, "F"=F,"G"=G,"H"=H,"I"=I,"K"=K,"L"=L,"M"=M,"N"=N,"P"=P,"Q"=Q,"R"=R,
              "S"=S,"T"=T,"V"=V,"W"=W,"Y"=Y,"xY"=Y,"O"=21)
#this function converts letters to a number based on the equalities that I have written above
#so aa_props(c(A,C,D)) will give c(1,2,3) as an answer

ThisKinTable<-fisheroddstable

NegativeScores<-rep(NA,times=nrow(NegativeSubstrateList))
NegativeWeirdScores<-rep(NA,times=nrow(NegativeSubstrateList))

#here I am going to take all of the Negative motifs and score them, according to the Fisher Odds Tables and the statistics of KINATEST-ID
for (v in 1:nrow(NegativeSubstrateList)) {
  motif<-NegativeSubstrateList[v,2]
  motif<-unlist(strsplit(motif,""))
  motif<- gsub(" ","O",motif)  
  motif <- sapply(motif, function (x) aa_props[x])
  Scoringpeptide<-motif
  Scoringpeptide<-Scoringpeptide
  ThisKinTableScore<-as.numeric(ThisKinTable[Scoringpeptide[1],2])*ThisKinTable[as.numeric(Scoringpeptide[2]),3]*ThisKinTable[as.numeric(Scoringpeptide[3]),4]*
    ThisKinTable[as.numeric(Scoringpeptide[4]),5]*ThisKinTable[as.numeric(Scoringpeptide[5]),6]*ThisKinTable[as.numeric(Scoringpeptide[6]),7]*
    ThisKinTable[as.numeric(Scoringpeptide[7]),8]*
    #ThisKinTable[as.numeric(Scoringpeptide[8]),10]*
    ThisKinTable[as.numeric(Scoringpeptide[9]),10]*ThisKinTable[as.numeric(Scoringpeptide[10]),11]*ThisKinTable[as.numeric(Scoringpeptide[11]),12]*
    ThisKinTable[as.numeric(Scoringpeptide[12]),13]*ThisKinTable[as.numeric(Scoringpeptide[13]),14]*ThisKinTable[as.numeric(Scoringpeptide[14]),15]*
    ThisKinTable[as.numeric(Scoringpeptide[15]),16]
  NegativeScores[v]<-ThisKinTableScore
  ThisKinTableScore<-(ThisKinTableScore/(ThisKinTableScore+1/as.numeric(NormalizationScore[2])))
  NegativeWeirdScores[v]<-ThisKinTableScore*100
}

negativesubstrates<-NegativeSubstrateList[,2]
NegativeWithScores<-cbind(negativesubstrates,as.character(NegativeScores),as.character(NegativeWeirdScores))

PositiveScores<-rep(NA,times=nrow(PositiveSubstrateList))
PositiveWeirdScores<-rep(NA,times=nrow(PositiveSubstrateList))

#now I'm going to score all of the positive motifs, according to the same statistics
for (v in 1:nrow(PositiveSubstrateList)) {
  motif<-PositiveSubstrateList[v,4:18]
  motif<-unlist(motif)
  motif<- gsub("^$","O",motif)
  motif <- sapply(motif, function (x) aa_props[x])
  Scoringpeptide<-motif
  Scoringpeptide<-Scoringpeptide
  ThisKinTableScore<-as.numeric(ThisKinTable[Scoringpeptide[1],2])*ThisKinTable[as.numeric(Scoringpeptide[2]),3]*ThisKinTable[as.numeric(Scoringpeptide[3]),4]*
    ThisKinTable[as.numeric(Scoringpeptide[4]),5]*ThisKinTable[as.numeric(Scoringpeptide[5]),6]*ThisKinTable[as.numeric(Scoringpeptide[6]),7]*
    ThisKinTable[as.numeric(Scoringpeptide[7]),8]*
    #ThisKinTable[as.numeric(Scoringpeptide[8]),10]*
    ThisKinTable[as.numeric(Scoringpeptide[9]),10]*ThisKinTable[as.numeric(Scoringpeptide[10]),11]*ThisKinTable[as.numeric(Scoringpeptide[11]),12]*
    ThisKinTable[as.numeric(Scoringpeptide[12]),13]*ThisKinTable[as.numeric(Scoringpeptide[13]),14]*ThisKinTable[as.numeric(Scoringpeptide[14]),15]*
    ThisKinTable[as.numeric(Scoringpeptide[15]),16]

  PositiveScores[v]<-ThisKinTableScore
  ThisKinTableScore<-(ThisKinTableScore/(ThisKinTableScore+1/as.numeric(NormalizationScore[2])))
  PositiveWeirdScores[v]<-ThisKinTableScore*100
}

positivesubstrates<-PositiveSubstrateList[,4:18]
positivewithscores<-cbind.data.frame(positivesubstrates,PositiveScores,PositiveWeirdScores)



SetOfAAs<-c("Letter","A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y")
SumOfSigmaAAs<-c(1:15)

for (i in 1:15){
  SumOfSigmasValue<-0
  for (j in 1:20){
    value<-0
    if (bareSDs[j,i]>1){
      k<-j+1
      value<-sum(substrates[,i]==SetOfAAs[k])
    }
    SumOfSigmasValue<-SumOfSigmasValue+value
  }
  SumOfSigmaAAs[i]<-SumOfSigmasValue
}
#this creates the Sum Of Sigmas value, also from the KINATEST-ID paper

threshold<-c(1:100,(1:9)/10,(1:9)/100,0,-.1)
threshold<-threshold[order(threshold,decreasing = TRUE)]
threshold

Truepositives<-c(1:120)
Falsenegatives<-c(1:120)
Sensitivity<-c(1:120)
TrueNegatives<-c(1:120)
FalsePositives<-c(1:120)
One_Minus_Specificity<-c(1:120)
Accuracy<-c(1:120)
MCC<-c(1:120)
EER<-c(1:120)
Precision<-c(1:120)
F_One_Half<-c(1:120)
F_One<-c(1:120)
F_Two<-c(1:120)
FalsePositiveRate<-c(1:120)

#here I am creating the Matthew's Correlation Coefficient table for each posible threshold value that could be used to separate negative motifs from positive motifs
#these statistics allow us to determine the best threshold value
for (z in 1:120) {
  thres<-threshold[z]
  Truepositives[z]<-length(PositiveWeirdScores[PositiveWeirdScores>=(thres)])
  Falsenegatives[z]<-nrow(positivesubstrates)-Truepositives[z]
  Sensitivity[z]<-Truepositives[z]/(Falsenegatives[z]+Truepositives[z])
  TrueNegatives[z]<-length(NegativeWeirdScores[NegativeWeirdScores<(thres)])
  # at thresh 100 this should be 0, because it is total minus true negatives
  FalsePositives[z]<-nrow(NegativeSubstrateList)-TrueNegatives[z]
  One_Minus_Specificity[z]<-1-(TrueNegatives[z]/(FalsePositives[z]+TrueNegatives[z]))
  Accuracy[z]<-100*(Truepositives[z]+TrueNegatives[z])/(Falsenegatives[z]+FalsePositives[z]+TrueNegatives[z]+Truepositives[z])
  MCC[z]<-((Truepositives[z]*TrueNegatives[z])-(Falsenegatives[z]*FalsePositives[z]))/sqrt(round(round(Truepositives[z]+Falsenegatives[z])*round(TrueNegatives[z]+FalsePositives[z])*round(Truepositives[z]+FalsePositives[z])*round(TrueNegatives[z]+Falsenegatives[z])))
  EER[z]<-(FalsePositives[z]+Falsenegatives[z])/(Truepositives[z]+TrueNegatives[z]+FalsePositives[z]+Falsenegatives[z])
  Precision[z]<-Truepositives[z]/(Truepositives[z]+FalsePositives[z])
  F_One_Half[z]<-(1.5*Precision[z]*Sensitivity[z])/(.25*Precision[z]+Sensitivity[z])
  F_One[z]<-(2*Precision[z]*Sensitivity[z])/(Precision[z]+Sensitivity[z])
  F_Two[z]<-(5*Precision[z]*Sensitivity[z])/(4*Precision[z]+Sensitivity[z])
  FalsePositiveRate[z]<-FalsePositives[z]/(TrueNegatives[z]+FalsePositives[z])
}
Characterization<-cbind.data.frame(threshold,Truepositives,Falsenegatives,Sensitivity,TrueNegatives,FalsePositives,One_Minus_Specificity,Accuracy,MCC,EER,Precision,FalsePositiveRate,F_One_Half,F_One,F_Two)

positiveheader<-c(1,2,3,4,5,6,7,8,9,10,11,12,13,"RPMS","PMS")
positivewithscores<-rbind.data.frame(positiveheader,positivewithscores)

negativeheader<-c("Substrate","RPMS","PMS")
colnames(NegativeWithScores)<-negativeheader

write.table(x=c("Characterzation Table"),file = FILENAME2, col.names = FALSE,row.names = FALSE, append = TRUE,sep = ",")
write.table(Characterization,file = FILENAME2, col.names = TRUE,row.names = FALSE, append = TRUE,sep = ",")
#write the output files


options(warn = oldw)
#reset the warning option