changeset 29:a555e95b2066 draft

Update phagepromoter.py -> pickle instead joblib
author martasampaio
date Mon, 28 Oct 2019 12:39:04 -0400
parents 4289d1b76d66
children c60c460307e7
files phagepromoter.py
diffstat 1 files changed, 570 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phagepromoter.py	Mon Oct 28 12:39:04 2019 -0400
@@ -0,0 +1,570 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Thu Jul 19 13:45:05 2018
+
+@author: Marta
+"""
+
+from Bio import SeqIO
+import numpy as np
+import pandas as pd
+from auxiliar import free_energy,freq_base
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio.Alphabet import IUPAC
+from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle
+
+#division of the test genome in sequences of 65 bp
+def get_testseqs65(form,fic,both=False):
+    ALL = []
+    indexes = []
+    a = 0
+    rec = SeqIO.read(fic,form)
+    genome = rec.seq
+    i = 0
+    j = 65
+    while j < len(genome):
+        s = genome[i:j]
+        ALL.append([1,i,j,s])
+        i += 20
+        j += 20
+        a += 1
+        indexes.append(rec.name+":"+str(a))
+    if both:
+        comp = genome.reverse_complement()
+        size = len(rec.seq)
+        i = 0
+        j = 65
+        while j < len(comp):
+            s = comp[i:j]
+            ALL.append([-1,size-j,size-i,s])
+            i += 20
+            j += 20
+            a += 1
+            indexes.append(rec.name+":"+str(a))
+    df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq'])
+    return df
+
+#calculate the scores of all sequences (similar to get_posScores and get_negScores)
+def get_testScores(loc,test):
+    scores = []
+    posis = []
+    sizes = []
+    dic = {}
+    for ind,row in test.iterrows():
+        _,window = ind.split(':')
+        strand = row['strand']
+        ini = row['iniprom']
+        end = row['endprom']
+        seq = row['seq']
+        pos = [ini,end,strand]
+        dic[window] = pos
+        s = seq
+        score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s)
+        maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt'))
+        score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s)
+        maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt'))
+        scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s)
+        maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt'))
+        scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s)
+        maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt'))
+        scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s)
+        maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt'))
+        scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s)
+        maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt'))
+        score23 = max(scores23)
+        score21 = max(scores21)
+        score27 = max(scores27)
+        score32 = max(scores32)
+        maxiphage = max(score23,score21,score27,score32)
+        if maxiphage == score23: phage_max = score23*maxi23
+        elif maxiphage == score21: phage_max = score21*maxi21
+        elif maxiphage == score27: phage_max = score27*maxi27
+        elif maxiphage == score32: phage_max = score32*maxi32
+        score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s)
+        maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt'))
+        score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s)
+        maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt'))
+        score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s)
+        maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt'))
+        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s)
+        maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt'))
+        score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s)
+        maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt'))
+        score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s)
+        maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt'))
+        
+        dists6 = []
+        score6 = []
+        for p in pos10_6:
+            for a in range(14,22):
+                d = p-a-6
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_6 = score35_6[d]
+                    score6.append([s35_6,s10])
+                    dists6.append([d,p])
+        dists9 = []            
+        score9 = []
+        for p in pos10_6:
+            for a in range(11,14):
+                d = p-a-9
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_9 = score35_9[d]
+                    score9.append([s35_9,s10])
+                    dists9.append([d,p])
+        distst4 = []
+        scoret4 = []
+        distscbb = []
+        scorecbb = []
+        for p in pos10_6:
+            for a in range(16,18):
+                d = p-a-7
+                if d >= 0: 
+                    s10 = score10_6[p]
+                    s35_t4 = score35_t4[d]
+                    s35_cbb = score35_cbb[d]
+                    scoret4.append([s35_t4,s10])
+                    distst4.append([d,p])
+                    scorecbb.append([s35_cbb,s10])
+                    distscbb.append([d,p])
+                    
+        
+        distsmu = []
+        scoremu = []
+        for p in pos10_6:
+            d = p-16-14
+            if d >= 0: 
+                s10 = score10_6[p]
+                s35_mu = score35_mu[d]
+                scoremu.append([s35_mu,s10])
+                distsmu.append([d,p])
+        
+        distslb = []
+        scorelb = []
+        for p in pos10_6:
+            d = p-13-14
+            if d >= 0: 
+                s10 = score10_6[p]
+                s35_lb = score35_lb[d]
+                scorelb.append([s35_lb,s10])
+                distslb.append([d,p])
+    
+        soma6 = [sum(x) for x in score6]
+        soma9 = [sum(x) for x in score9]
+        somat4 = [sum(x) for x in scoret4]
+        somacbb = [sum(x) for x in scorecbb]
+        somamu = [sum(x) for x in scoremu]
+        somalb = [sum(x) for x in scorelb]
+        
+        maxi6 = max(soma6)
+        maxi9 = max(soma9)
+        maxit4 = max(somat4)
+        maxicbb = max(somacbb)
+        maximu = max(somamu)
+        maxilb = max(somalb)
+        maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu)
+
+        if maxi_elems == maxilb:
+            indmax = somalb.index(maxilb)
+            sc35 = scorelb[indmax][0]
+            sc10 = scorelb[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distslb[indmax]
+            size35 = 14
+            elems_maxi = sc35*maxi35_lb+sc10*maxi10_6
+        elif maxi_elems == maximu:
+            indmax = somamu.index(maximu)
+            sc35 = scoremu[indmax][0]
+            sc10 = scoremu[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distsmu[indmax]
+            size35 = 14
+            elems_maxi = sc35*maxi35_mu+sc10*maxi10_6
+        elif maxi_elems == maxi9:
+            indmax = soma9.index(maxi9)
+            sc35 = score9[indmax][0]
+            sc10 = score9[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = dists9[indmax]
+            size35 = 9
+            elems_maxi = sc35*maxi35_9+sc10*maxi10_6
+        elif maxi_elems == maxit4:
+            indmax = somat4.index(maxit4)
+            sc35 = scoret4[indmax][0]
+            sc10 = scoret4[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distst4[indmax]
+            size35 = 7
+            elems_maxi = sc35*maxi35_t4+sc10*maxi10_6
+        elif maxi_elems == maxicbb:
+            indmax = somacbb.index(maxicbb)
+            sc35 = scorecbb[indmax][0]
+            sc10 = scorecbb[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = distscbb[indmax]
+            size35 = 7
+            elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6
+        else:
+            indmax = soma6.index(maxi6)
+            sc35 = score6[indmax][0]
+            sc10 = score6[indmax][1]
+            score_elems = [sc35,sc10]
+            posel = dists6[indmax]
+            size35 = 6
+            elems_maxi = sc35*maxi35_6+sc10*maxi10_6
+        
+        if score23 == maxiphage:
+            phage_score = score23
+            posiphage = scores23.index(score23)
+            sizephage = 23
+        elif score21 == maxiphage:
+            phage_score = score21
+            posiphage = scores21.index(score21)
+            sizephage = 21
+        elif score27 == maxiphage:
+            phage_score = score27
+            posiphage = scores27.index(score27)
+            sizephage = 27
+        else:
+            phage_score = score32
+            posiphage = scores32.index(score32)
+            sizephage = 32
+            
+        if elems_maxi >= max(score10_8)*maxi10_8:
+            i = posel[1]
+            ext = s[i-3:i-1]
+            if ext == 'TG': tg = 1
+            else: tg = 0
+            if elems_maxi > phage_max: host = 1
+            else: 
+                host = 0
+                tg = 0
+            sc = max(score10_8)
+            end35 = posel[0]+size35
+            dist = posel[1]-end35
+            scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+            posis.append([posel[1],posel[0],posiphage])
+            sizes.append([6,size35,sizephage])
+        else:
+            host = 1
+            sc = max(score10_8)
+            i = score10_8.index(sc)
+            ext = s[i-3:i-1]
+            if ext == 'TG': tg = 1
+            else: tg = 0
+            if max(score10_8)*maxi10_8 > phage_max: host = 1
+            else: 
+                host = 0
+                tg = 0
+            end35 = posel[0]+size35
+            dist = posel[1]-end35
+            scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
+            posis.append([i,posel[0],posiphage])
+            sizes.append([8,size35,sizephage])
+    score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq'])
+    posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage'])
+    sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage'])
+    df_all = pd.concat([score,posis,sizes],axis=1)
+    return df_all,dic
+
+def create_dftest(scores_test,dic_window,family,bacteria,lifecycle):
+    tudo = []
+    tudo2 = []
+    for ind,row in scores_test.iterrows():
+        _,window = ind.split(':')
+        posis = dic_window[window]
+        strand=posis[2]
+        if strand == 1: ini=posis[0]
+        else: ini=posis[1]
+        seqprom = row['seq']
+        score10 = row['score10']
+        score10_8 = row['score10_8']
+        score35 = row['score35']
+        scorephage = row['score_phage']
+        size10 = row['size10']
+        size35 = row['size35']
+        sizephage = row['size_phage']
+        ini10 = row['pos10']
+        tg = row['tg']
+        host = row['host']
+        ini35 = row['pos35']
+        dist = row['dist']
+        end10=ini10+size10
+        iniphage = row['posphage']
+        endphage = iniphage+sizephage
+        if strand == 1:
+            if host == 0: 
+                new_seq = seqprom[iniphage:endphage]
+                new_ini = ini+iniphage+1
+                new_end = ini+endphage
+            else:
+                if size10 == 6: 
+                    new_seq = seqprom[ini35:end10]
+                    new_ini = ini+ini35+1
+                    new_end = ini+end10
+                else: 
+                    new_seq = seqprom[ini10:end10]
+                    new_ini = ini+ini10+1
+                    new_end = ini+end10
+            new_pos = '('+str(new_ini)+'..'+str(new_end)+')'
+        else:
+            if host == 0: 
+                new_seq = seqprom[iniphage:endphage]
+                new_ini = ini-endphage+1
+                new_end = ini-iniphage
+            else:
+                if size10 == 6: 
+                    new_seq = seqprom[ini35:end10]
+                    new_ini = ini-end10+1
+                    new_end = ini-ini35
+                else: 
+                    new_seq = seqprom[ini10:end10]
+                    new_ini = ini-end10+1
+                    new_end = ini-ini10
+            new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')'
+        if size10 == 6:
+            size10_6 = 1
+        else: 
+            size10_6 = 0
+        if size35 == 6:
+            size35_6 = 1
+            size35_7 = 0
+            size35_9 = 0
+            size35_14 = 0
+        elif size35 == 7:
+            size35_6 = 0
+            size35_7 = 1
+            size35_9 = 0
+            size35_14 = 0
+        elif size35 == 9:
+            size35_6 = 0
+            size35_7 = 0
+            size35_9 = 1
+            size35_14 = 0
+        else:
+            size35_6 = 0
+            size35_7 = 0
+            size35_9 = 0
+            size35_14 = 1
+        if sizephage == 23:
+            sizephage_23 = 1
+            sizephage_21 = 0
+            sizephage_32 = 0
+        elif sizephage == 21:
+            sizephage_23 = 0
+            sizephage_21 = 1
+            sizephage_32 = 0
+        elif sizephage == 32:
+            sizephage_23 = 0
+            sizephage_21 = 0
+            sizephage_32 = 1
+        else:
+            sizephage_23 = 0
+            sizephage_21 = 0
+            sizephage_32 = 0
+        if family == 'Podoviridae': 
+            Podo = 1
+            Sipho = 0
+            Myo = 0
+        elif family == 'Siphoviridae':
+            Podo = 0
+            Sipho = 1
+            Myo = 0
+        elif family == 'Myoviridae':
+            Podo = 0
+            Sipho = 0
+            Myo = 1
+        else: 
+            Podo = 0
+            Sipho = 0
+            Myo = 0
+        if bacteria == 'Bacillus':
+            bac = [1,0,0,0,0]
+        elif bacteria == 'Escherichia coli':
+            bac = [0,1,0,0,0]
+        elif bacteria == 'Klebsiella':
+            bac = [0,0,1,0,0]
+        elif bacteria == 'Pectobacterium':
+            bac = [0,0,0,1,0]
+        elif bacteria == 'Cronobacter':
+            bac = [0,0,0,0,1]
+        else:
+            bac = [0,0,0,0,0]
+        if lifecycle == 'virulent':  tp = 0
+        else: tp = 1
+        fe = free_energy(str(seqprom))
+        AT = freq_base(str(seqprom))
+        linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6,
+                 size35_6, size35_7, size35_9, size35_14,
+                 sizephage_23, sizephage_21,
+                 sizephage_32, tg, Podo, Sipho, Myo,tp]
+        linha.extend(bac)
+        tudo.append(linha)
+        linha2 = [new_pos,str(new_seq), host, size10_6,
+                 score10, score10_8, size35_6, size35_7, size35_9,size35_14, 
+                 score35, dist, sizephage_23, sizephage_21,
+                 sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT]
+        linha2.extend(bac)
+        tudo2.append(linha2)
+    df_test = pd.DataFrame(tudo, index=scores_test.index, 
+                          columns = ['score10', 'score10_8','score35', 'dist35_10',
+                                     'scorephage','fe', 'freqAT',
+                                     'host','size10',
+                                     'size35_6', 'size35_7','size35_9','size35_14',
+                                     'sizephage_23', 
+                                     'sizephage_21', 'sizephage_32',
+                                     'TG', 'Podo', 'Sipho', 'Myo', 'tp',
+                                     'Bacillus', 'EColi', 
+                                     'Pectobacterium','Klebsiella', 'Cronobacter'])
+    df_INFO = pd.DataFrame(tudo2, index=scores_test.index, 
+                          columns = ['Positions','Promoter Sequence','host','size10',
+                                     'score10', 'score10_8',
+                                     'size35_6', 'size35_7','size35_9','size35_14',
+                                     'score35', 'dist35_10','sizephage_23', 
+                                     'sizephage_21', 'sizephage_32',
+                                     'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT',
+                                     'EColi', 'Salmonella', 
+                                     'Pectobacterium','Cronobacter', 'Streptococcus'])
+    return df_test,df_INFO
+
+
+def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):
+    import pickle
+    scaler = pickle.load(open(scaler_file, 'rb'))
+    model = pickle.load(open(model_file, 'rb'))
+    feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])
+    TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)
+    scores = model.predict_proba(TEST_scaled)
+    pos_scores = np.empty((TEST_scaled.shape[0],0), float)
+    for x in scores: pos_scores = np.append(pos_scores,x[1])
+    try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x
+    except ValueError: return 'The threshold value is not a float'
+    else:
+        if len(positive_indexes) == 0: return None
+        else:
+            positive_windows = TEST_scaled.index[positive_indexes]
+            INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']]
+            promoter_type = []
+            for x in df_testinfo.loc[positive_windows,'host'].tolist():
+                if x == 0: promoter_type.append('phage')
+                else: promoter_type.append('host')
+            INFO['Type'] = promoter_type
+            INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3)
+            INFO.index = positive_windows
+            return INFO
+
+
+def get_finaldf(test,rec):
+    new_df = test.groupby(['Positions'])
+    groups = list(new_df.groups.keys())
+    for i in range(len(groups)-1):
+        for j in range(i, len(groups)):
+            if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0])
+            else: inii = int(groups[i][1:].split('..')[0])
+            if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0])
+            else: inij = int(groups[j][1:].split('..')[0])
+            if inij < inii:
+                temp = groups[i]
+                groups[i] = groups[j]
+                groups[j] = temp
+    new_inds = []
+    for g in groups:
+        inds = new_df.groups[g]
+        if len(inds) == 1: new_inds.append(inds[0])
+        else:
+            maxi = max(new_df.get_group(g)['Scores'])
+            i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0]
+            new_inds.append(i)
+    
+    output = test.loc[new_inds,:]
+    strands = []
+    new_pos = []
+    old_pos = output['Positions'].tolist()
+    
+    from Bio.SeqFeature import SeqFeature, FeatureLocation
+    feats = rec.features
+    for ind, row in output.iterrows():
+        pos = row['Positions']
+        if 'complement' in pos: 
+            strands.append('-')
+            new_pos.append(pos[10:])
+            ini,end= pos[11:-1].split('..')
+            new_loc = FeatureLocation(int(ini),int(end),strand=-1)
+        else: 
+            strands.append('+')
+            new_pos.append(pos)
+            ini,end= pos[1:-1].split('..')
+            new_loc = FeatureLocation(int(ini),int(end),strand=1)
+        feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']})
+        feats.append(feat)            
+            
+    output.insert(loc=0, column='Strand', value=strands)
+    output['Positions'] = new_pos
+    output.to_html('output.html',index=False, justify='center')
+    recs = []
+    i = 0
+    for ind,row in output.iterrows():
+        s = Seq(row['Promoter Sequence'])
+        posis = old_pos[i]
+        typ = row['Type']
+        score = row['Scores']
+        sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score))
+        recs.append(sq)
+        i += 1
+    SeqIO.write(recs, 'output.fasta','fasta')
+    new_rec = rec
+    new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
+    new_feats = sorted(feats, key=lambda x: x.location.start)
+    new_rec.features = new_feats
+    SeqIO.write(new_rec,'output.gb','genbank')
+
+if __name__== "__main__":
+    
+    import sys
+    import os
+    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+    
+    gen_format = sys.argv[1]
+    genome_file = sys.argv[2]
+    both = sys.argv[3]
+    threshold = sys.argv[4]
+    family = sys.argv[5]
+    host = sys.argv[6]
+    phage_type = sys.argv[7]
+    model = sys.argv[8]
+    '''
+    gen_format = 'genbank'
+    genome_file = 'test-data/NC_015264.gb'
+    both = False
+    threshold = '0.50'
+    family = 'Podoviridae'
+    host = 'Pseudomonas'
+    phage_type = 'virulent'
+    model = 'SVM2400'
+    #model = 'ANN1600'
+    '''
+    
+    rec = SeqIO.read(genome_file, gen_format)
+    test_windows = get_testseqs65(gen_format, genome_file,both)
+    try: score_test,dic_window = get_testScores(__location__,test_windows)
+    except IndexError: print('Error. Input sequence can only have A,C,G or T')
+    else:
+        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)
+        if model == 'ANN1600':
+            scaler_file = os.path.join(__location__, 'scaler1600.sav')
+            model_file = os.path.join(__location__, 'model1600.sav')
+            preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)
+            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+            elif type(preds) == str: print(preds)
+            else: output = get_finaldf(preds,rec)
+        else:
+            scaler_file = os.path.join(__location__, 'scaler2400.sav')
+            model_file = os.path.join(__location__, 'model2400.sav')
+            new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]
+            preds =  get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)
+            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
+            elif type(preds) == str: print(preds)
+            else: output = get_finaldf(preds,rec)
+