view phagepromoter.py @ 13:8b9534a83ae2 draft

Uploaded
author martasampaio
date Sat, 20 Apr 2019 11:03:04 -0400
parents 30b5e33eca40
children
line wrap: on
line source

# -*- coding: utf-8 -*-
"""
Created on Thu Jul 19 13:45:05 2018

@author: Marta
"""

from Bio import SeqIO
import numpy as np
import pandas as pd
from auxiliar import free_energy,freq_base
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle

#division of the test genome in sequences of 65 bp
def get_testseqs65(form,fic,both=False):
    ALL = []
    indexes = []
    a = 0
    rec = SeqIO.read(fic,form)
    genome = rec.seq
    i = 0
    j = 65
    while j < len(genome):
        s = genome[i:j]
        ALL.append([1,i,j,s])
        i += 20
        j += 20
        a += 1
        indexes.append(rec.name+":"+str(a))
    if both:
        comp = genome.reverse_complement()
        size = len(rec.seq)
        i = 0
        j = 65
        while j < len(comp):
            s = comp[i:j]
            ALL.append([-1,size-j,size-i,s])
            i += 20
            j += 20
            a += 1
            indexes.append(rec.name+":"+str(a))
    df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq'])
    return df

#calculate the scores of all sequences (similar to get_posScores and get_negScores)
def get_testScores(loc,test):
    scores = []
    posis = []
    sizes = []
    dic = {}
    for ind,row in test.iterrows():
        _,window = ind.split(':')
        strand = row['strand']
        ini = row['iniprom']
        end = row['endprom']
        seq = row['seq']
        pos = [ini,end,strand]
        dic[window] = pos
        s = seq
        score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s)
        maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt'))
        score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s)
        maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt'))
        scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s)
        maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt'))
        scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s)
        maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt'))
        scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s)
        maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt'))
        scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s)
        maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt'))
        score23 = max(scores23)
        score21 = max(scores21)
        score27 = max(scores27)
        score32 = max(scores32)
        maxiphage = max(score23,score21,score27,score32)
        if maxiphage == score23: phage_max = score23*maxi23
        elif maxiphage == score21: phage_max = score21*maxi21
        elif maxiphage == score27: phage_max = score27*maxi27
        elif maxiphage == score32: phage_max = score32*maxi32
        score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s)
        maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt'))
        score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s)
        maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt'))
        score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s)
        maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt'))
        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s)
        maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt'))
        score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s)
        maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt'))
        score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s)
        maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt'))
        
        dists6 = []
        score6 = []
        for p in pos10_6:
            for a in range(14,22):
                d = p-a-6
                if d >= 0: 
                    s10 = score10_6[p]
                    s35_6 = score35_6[d]
                    score6.append([s35_6,s10])
                    dists6.append([d,p])
        dists9 = []            
        score9 = []
        for p in pos10_6:
            for a in range(11,14):
                d = p-a-9
                if d >= 0: 
                    s10 = score10_6[p]
                    s35_9 = score35_9[d]
                    score9.append([s35_9,s10])
                    dists9.append([d,p])
        distst4 = []
        scoret4 = []
        distscbb = []
        scorecbb = []
        for p in pos10_6:
            for a in range(16,18):
                d = p-a-7
                if d >= 0: 
                    s10 = score10_6[p]
                    s35_t4 = score35_t4[d]
                    s35_cbb = score35_cbb[d]
                    scoret4.append([s35_t4,s10])
                    distst4.append([d,p])
                    scorecbb.append([s35_cbb,s10])
                    distscbb.append([d,p])
                    
        
        distsmu = []
        scoremu = []
        for p in pos10_6:
            d = p-16-14
            if d >= 0: 
                s10 = score10_6[p]
                s35_mu = score35_mu[d]
                scoremu.append([s35_mu,s10])
                distsmu.append([d,p])
        
        distslb = []
        scorelb = []
        for p in pos10_6:
            d = p-13-14
            if d >= 0: 
                s10 = score10_6[p]
                s35_lb = score35_lb[d]
                scorelb.append([s35_lb,s10])
                distslb.append([d,p])
    
        soma6 = [sum(x) for x in score6]
        soma9 = [sum(x) for x in score9]
        somat4 = [sum(x) for x in scoret4]
        somacbb = [sum(x) for x in scorecbb]
        somamu = [sum(x) for x in scoremu]
        somalb = [sum(x) for x in scorelb]
        
        maxi6 = max(soma6)
        maxi9 = max(soma9)
        maxit4 = max(somat4)
        maxicbb = max(somacbb)
        maximu = max(somamu)
        maxilb = max(somalb)
        maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu)

        if maxi_elems == maxilb:
            indmax = somalb.index(maxilb)
            sc35 = scorelb[indmax][0]
            sc10 = scorelb[indmax][1]
            score_elems = [sc35,sc10]
            posel = distslb[indmax]
            size35 = 14
            elems_maxi = sc35*maxi35_lb+sc10*maxi10_6
        elif maxi_elems == maximu:
            indmax = somamu.index(maximu)
            sc35 = scoremu[indmax][0]
            sc10 = scoremu[indmax][1]
            score_elems = [sc35,sc10]
            posel = distsmu[indmax]
            size35 = 14
            elems_maxi = sc35*maxi35_mu+sc10*maxi10_6
        elif maxi_elems == maxi9:
            indmax = soma9.index(maxi9)
            sc35 = score9[indmax][0]
            sc10 = score9[indmax][1]
            score_elems = [sc35,sc10]
            posel = dists9[indmax]
            size35 = 9
            elems_maxi = sc35*maxi35_9+sc10*maxi10_6
        elif maxi_elems == maxit4:
            indmax = somat4.index(maxit4)
            sc35 = scoret4[indmax][0]
            sc10 = scoret4[indmax][1]
            score_elems = [sc35,sc10]
            posel = distst4[indmax]
            size35 = 7
            elems_maxi = sc35*maxi35_t4+sc10*maxi10_6
        elif maxi_elems == maxicbb:
            indmax = somacbb.index(maxicbb)
            sc35 = scorecbb[indmax][0]
            sc10 = scorecbb[indmax][1]
            score_elems = [sc35,sc10]
            posel = distscbb[indmax]
            size35 = 7
            elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6
        else:
            indmax = soma6.index(maxi6)
            sc35 = score6[indmax][0]
            sc10 = score6[indmax][1]
            score_elems = [sc35,sc10]
            posel = dists6[indmax]
            size35 = 6
            elems_maxi = sc35*maxi35_6+sc10*maxi10_6
        
        if score23 == maxiphage:
            phage_score = score23
            posiphage = scores23.index(score23)
            sizephage = 23
        elif score21 == maxiphage:
            phage_score = score21
            posiphage = scores21.index(score21)
            sizephage = 21
        elif score27 == maxiphage:
            phage_score = score27
            posiphage = scores27.index(score27)
            sizephage = 27
        else:
            phage_score = score32
            posiphage = scores32.index(score32)
            sizephage = 32
            
        if elems_maxi >= max(score10_8)*maxi10_8:
            i = posel[1]
            ext = s[i-3:i-1]
            if ext == 'TG': tg = 1
            else: tg = 0
            if elems_maxi > phage_max: host = 1
            else: 
                host = 0
                tg = 0
            sc = max(score10_8)
            end35 = posel[0]+size35
            dist = posel[1]-end35
            scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
            posis.append([posel[1],posel[0],posiphage])
            sizes.append([6,size35,sizephage])
        else:
            host = 1
            sc = max(score10_8)
            i = score10_8.index(sc)
            ext = s[i-3:i-1]
            if ext == 'TG': tg = 1
            else: tg = 0
            if max(score10_8)*maxi10_8 > phage_max: host = 1
            else: 
                host = 0
                tg = 0
            end35 = posel[0]+size35
            dist = posel[1]-end35
            scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
            posis.append([i,posel[0],posiphage])
            sizes.append([8,size35,sizephage])
    score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq'])
    posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage'])
    sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage'])
    df_all = pd.concat([score,posis,sizes],axis=1)
    return df_all,dic

def create_dftest(scores_test,dic_window,family,bacteria,lifecycle):
    tudo = []
    tudo2 = []
    for ind,row in scores_test.iterrows():
        _,window = ind.split(':')
        posis = dic_window[window]
        strand=posis[2]
        if strand == 1: ini=posis[0]
        else: ini=posis[1]
        seqprom = row['seq']
        score10 = row['score10']
        score10_8 = row['score10_8']
        score35 = row['score35']
        scorephage = row['score_phage']
        size10 = row['size10']
        size35 = row['size35']
        sizephage = row['size_phage']
        ini10 = row['pos10']
        tg = row['tg']
        host = row['host']
        ini35 = row['pos35']
        dist = row['dist']
        end10=ini10+size10
        iniphage = row['posphage']
        endphage = iniphage+sizephage
        if strand == 1:
            if host == 0: 
                new_seq = seqprom[iniphage:endphage]
                new_ini = ini+iniphage+1
                new_end = ini+endphage
            else:
                if size10 == 6: 
                    new_seq = seqprom[ini35:end10]
                    new_ini = ini+ini35+1
                    new_end = ini+end10
                else: 
                    new_seq = seqprom[ini10:end10]
                    new_ini = ini+ini10+1
                    new_end = ini+end10
            new_pos = '('+str(new_ini)+'..'+str(new_end)+')'
        else:
            if host == 0: 
                new_seq = seqprom[iniphage:endphage]
                new_ini = ini-endphage+1
                new_end = ini-iniphage
            else:
                if size10 == 6: 
                    new_seq = seqprom[ini35:end10]
                    new_ini = ini-end10+1
                    new_end = ini-ini35
                else: 
                    new_seq = seqprom[ini10:end10]
                    new_ini = ini-end10+1
                    new_end = ini-ini10
            new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')'
        if size10 == 6:
            size10_6 = 1
        else: 
            size10_6 = 0
        if size35 == 6:
            size35_6 = 1
            size35_7 = 0
            size35_9 = 0
            size35_14 = 0
        elif size35 == 7:
            size35_6 = 0
            size35_7 = 1
            size35_9 = 0
            size35_14 = 0
        elif size35 == 9:
            size35_6 = 0
            size35_7 = 0
            size35_9 = 1
            size35_14 = 0
        else:
            size35_6 = 0
            size35_7 = 0
            size35_9 = 0
            size35_14 = 1
        if sizephage == 23:
            sizephage_23 = 1
            sizephage_21 = 0
            sizephage_32 = 0
        elif sizephage == 21:
            sizephage_23 = 0
            sizephage_21 = 1
            sizephage_32 = 0
        elif sizephage == 32:
            sizephage_23 = 0
            sizephage_21 = 0
            sizephage_32 = 1
        else:
            sizephage_23 = 0
            sizephage_21 = 0
            sizephage_32 = 0
        if family == 'Podoviridae': 
            Podo = 1
            Sipho = 0
            Myo = 0
        elif family == 'Siphoviridae':
            Podo = 0
            Sipho = 1
            Myo = 0
        elif family == 'Myoviridae':
            Podo = 0
            Sipho = 0
            Myo = 1
        else: 
            Podo = 0
            Sipho = 0
            Myo = 0
        if bacteria == 'Bacillus':
            bac = [1,0,0,0,0]
        elif bacteria == 'Escherichia coli':
            bac = [0,1,0,0,0]
        elif bacteria == 'Klebsiella':
            bac = [0,0,1,0,0]
        elif bacteria == 'Pectobacterium':
            bac = [0,0,0,1,0]
        elif bacteria == 'Cronobacter':
            bac = [0,0,0,0,1]
        else:
            bac = [0,0,0,0,0]
        if lifecycle == 'virulent':  tp = 0
        else: tp = 1
        fe = free_energy(str(seqprom))
        AT = freq_base(str(seqprom))
        linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6,
                 size35_6, size35_7, size35_9, size35_14,
                 sizephage_23, sizephage_21,
                 sizephage_32, tg, Podo, Sipho, Myo,tp]
        linha.extend(bac)
        tudo.append(linha)
        linha2 = [new_pos,str(new_seq), host, size10_6,
                 score10, score10_8, size35_6, size35_7, size35_9,size35_14, 
                 score35, dist, sizephage_23, sizephage_21,
                 sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT]
        linha2.extend(bac)
        tudo2.append(linha2)
    df_test = pd.DataFrame(tudo, index=scores_test.index, 
                          columns = ['score10', 'score10_8','score35', 'dist35_10',
                                     'scorephage','fe', 'freqAT',
                                     'host','size10',
                                     'size35_6', 'size35_7','size35_9','size35_14',
                                     'sizephage_23', 
                                     'sizephage_21', 'sizephage_32',
                                     'TG', 'Podo', 'Sipho', 'Myo', 'tp',
                                     'Bacillus', 'EColi', 
                                     'Pectobacterium','Klebsiella', 'Cronobacter'])
    df_INFO = pd.DataFrame(tudo2, index=scores_test.index, 
                          columns = ['Positions','Promoter Sequence','host','size10',
                                     'score10', 'score10_8',
                                     'size35_6', 'size35_7','size35_9','size35_14',
                                     'score35', 'dist35_10','sizephage_23', 
                                     'sizephage_21', 'sizephage_32',
                                     'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT',
                                     'EColi', 'Salmonella', 
                                     'Pectobacterium','Cronobacter', 'Streptococcus'])
    return df_test,df_INFO


def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):
    from sklearn.externals import joblib
    scaler = joblib.load(scaler_file)
    model = joblib.load(model_file)
    feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])
    TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)
    scores = model.predict_proba(TEST_scaled)
    pos_scores = np.empty((TEST_scaled.shape[0],0), float)
    for x in scores: pos_scores = np.append(pos_scores,x[1])
    try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x
    except ValueError: return 'The threshold value is not a float'
    else:
        if len(positive_indexes) == 0: return None
        else:
            positive_windows = TEST_scaled.index[positive_indexes]
            INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']]
            promoter_type = []
            for x in df_testinfo.loc[positive_windows,'host'].tolist():
                if x == 0: promoter_type.append('phage')
                else: promoter_type.append('host')
            INFO['Type'] = promoter_type
            INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3)
            INFO.index = positive_windows
            return INFO


def get_finaldf(test,rec):
    new_df = test.groupby(['Positions'])
    groups = list(new_df.groups.keys())
    for i in range(len(groups)-1):
        for j in range(i, len(groups)):
            if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0])
            else: inii = int(groups[i][1:].split('..')[0])
            if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0])
            else: inij = int(groups[j][1:].split('..')[0])
            if inij < inii:
                temp = groups[i]
                groups[i] = groups[j]
                groups[j] = temp
    new_inds = []
    for g in groups:
        inds = new_df.groups[g]
        if len(inds) == 1: new_inds.append(inds[0])
        else:
            maxi = max(new_df.get_group(g)['Scores'])
            i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0]
            new_inds.append(i)
    
    output = test.loc[new_inds,:]
    strands = []
    new_pos = []
    old_pos = output['Positions'].tolist()
    
    from Bio.SeqFeature import SeqFeature, FeatureLocation
    feats = rec.features
    for ind, row in output.iterrows():
        pos = row['Positions']
        if 'complement' in pos: 
            strands.append('-')
            new_pos.append(pos[10:])
            ini,end= pos[11:-1].split('..')
            new_loc = FeatureLocation(int(ini),int(end),strand=-1)
        else: 
            strands.append('+')
            new_pos.append(pos)
            ini,end= pos[1:-1].split('..')
            new_loc = FeatureLocation(int(ini),int(end),strand=1)
        feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']})
        feats.append(feat)            
            
    output.insert(loc=0, column='Strand', value=strands)
    output['Positions'] = new_pos
    output.to_html('output.html',index=False, justify='center')
    recs = []
    i = 0
    for ind,row in output.iterrows():
        s = Seq(row['Promoter Sequence'])
        posis = old_pos[i]
        typ = row['Type']
        score = row['Scores']
        sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score))
        recs.append(sq)
        i += 1
    SeqIO.write(recs, 'output.fasta','fasta')
    new_rec = rec
    new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
    new_feats = sorted(feats, key=lambda x: x.location.start)
    new_rec.features = new_feats
    SeqIO.write(new_rec,'output.gb','genbank')

if __name__== "__main__":
    
    import sys
    import os
    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    
    gen_format = sys.argv[1]
    genome_file = sys.argv[2]
    both = sys.argv[3]
    threshold = sys.argv[4]
    family = sys.argv[5]
    host = sys.argv[6]
    phage_type = sys.argv[7]
    model = sys.argv[8]
    '''
    gen_format = 'genbank'
    genome_file = 'test-data/NC_015264.gb'
    both = False
    threshold = '0.50'
    family = 'Podoviridae'
    host = 'Pseudomonas'
    phage_type = 'virulent'
    model = 'SVM2400'
    #model = 'ANN1600'
    '''
    
    rec = SeqIO.read(genome_file, gen_format)
    test_windows = get_testseqs65(gen_format, genome_file,both)
    try: score_test,dic_window = get_testScores(__location__,test_windows)
    except IndexError: print('Error. Input sequence can only have A,C,G or T')
    else:
        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)
        if model == 'ANN1600':
            scaler_file = os.path.join(__location__, 'scaler1600.sav')
            model_file = os.path.join(__location__, 'model1600.sav')
            preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)
            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
            elif type(preds) == str: print(preds)
            else: output = get_finaldf(preds,rec)
        else:
            scaler_file = os.path.join(__location__, 'scaler2400.sav')
            model_file = os.path.join(__location__, 'model2400.sav')
            new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]
            preds =  get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)
            if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
            elif type(preds) == str: print(preds)
            else: output = get_finaldf(preds,rec)