# HG changeset patch # User martasampaio # Date 1555772278 14400 # Node ID ebd359f22fa5235017bab81deefcbb421b69dd47 # Parent 09a05b1e1379b4a070c18280f17c564bd6592cb6 Deleted selected files diff -r 09a05b1e1379 -r ebd359f22fa5 phage_promoter.py --- a/phage_promoter.py Sat Apr 20 10:57:46 2019 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,570 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Thu Jul 19 13:45:05 2018 - -@author: Marta -""" - -from Bio import SeqIO -import numpy as np -import pandas as pd -from auxiliar import free_energy,freq_base -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord -from Bio.Alphabet import IUPAC -from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle - -#division of the test genome in sequences of 65 bp -def get_testseqs65(form,fic,both=False): - ALL = [] - indexes = [] - a = 0 - rec = SeqIO.read(fic,form) - genome = rec.seq - i = 0 - j = 65 - while j < len(genome): - s = genome[i:j] - ALL.append([1,i,j,s]) - i += 20 - j += 20 - a += 1 - indexes.append(rec.name+":"+str(a)) - if both: - comp = genome.reverse_complement() - size = len(rec.seq) - i = 0 - j = 65 - while j < len(comp): - s = comp[i:j] - ALL.append([-1,size-j,size-i,s]) - i += 20 - j += 20 - a += 1 - indexes.append(rec.name+":"+str(a)) - df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq']) - return df - -#calculate the scores of all sequences (similar to get_posScores and get_negScores) -def get_testScores(loc,test): - scores = [] - posis = [] - sizes = [] - dic = {} - for ind,row in test.iterrows(): - _,window = ind.split(':') - strand = row['strand'] - ini = row['iniprom'] - end = row['endprom'] - seq = row['seq'] - pos = [ini,end,strand] - dic[window] = pos - s = seq - score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s) - maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt')) - score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s) - maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt')) - scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s) - maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt')) - scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s) - maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt')) - scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s) - maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt')) - scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s) - maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt')) - score23 = max(scores23) - score21 = max(scores21) - score27 = max(scores27) - score32 = max(scores32) - maxiphage = max(score23,score21,score27,score32) - if maxiphage == score23: phage_max = score23*maxi23 - elif maxiphage == score21: phage_max = score21*maxi21 - elif maxiphage == score27: phage_max = score27*maxi27 - elif maxiphage == score32: phage_max = score32*maxi32 - score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s) - maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt')) - score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s) - maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt')) - score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s) - maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt')) - score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s) - maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt')) - score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s) - maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt')) - score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s) - maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt')) - - dists6 = [] - score6 = [] - for p in pos10_6: - for a in range(14,22): - d = p-a-6 - if d >= 0: - s10 = score10_6[p] - s35_6 = score35_6[d] - score6.append([s35_6,s10]) - dists6.append([d,p]) - dists9 = [] - score9 = [] - for p in pos10_6: - for a in range(11,14): - d = p-a-9 - if d >= 0: - s10 = score10_6[p] - s35_9 = score35_9[d] - score9.append([s35_9,s10]) - dists9.append([d,p]) - distst4 = [] - scoret4 = [] - distscbb = [] - scorecbb = [] - for p in pos10_6: - for a in range(16,18): - d = p-a-7 - if d >= 0: - s10 = score10_6[p] - s35_t4 = score35_t4[d] - s35_cbb = score35_cbb[d] - scoret4.append([s35_t4,s10]) - distst4.append([d,p]) - scorecbb.append([s35_cbb,s10]) - distscbb.append([d,p]) - - - distsmu = [] - scoremu = [] - for p in pos10_6: - d = p-16-14 - if d >= 0: - s10 = score10_6[p] - s35_mu = score35_mu[d] - scoremu.append([s35_mu,s10]) - distsmu.append([d,p]) - - distslb = [] - scorelb = [] - for p in pos10_6: - d = p-13-14 - if d >= 0: - s10 = score10_6[p] - s35_lb = score35_lb[d] - scorelb.append([s35_lb,s10]) - distslb.append([d,p]) - - soma6 = [sum(x) for x in score6] - soma9 = [sum(x) for x in score9] - somat4 = [sum(x) for x in scoret4] - somacbb = [sum(x) for x in scorecbb] - somamu = [sum(x) for x in scoremu] - somalb = [sum(x) for x in scorelb] - - maxi6 = max(soma6) - maxi9 = max(soma9) - maxit4 = max(somat4) - maxicbb = max(somacbb) - maximu = max(somamu) - maxilb = max(somalb) - maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu) - - if maxi_elems == maxilb: - indmax = somalb.index(maxilb) - sc35 = scorelb[indmax][0] - sc10 = scorelb[indmax][1] - score_elems = [sc35,sc10] - posel = distslb[indmax] - size35 = 14 - elems_maxi = sc35*maxi35_lb+sc10*maxi10_6 - elif maxi_elems == maximu: - indmax = somamu.index(maximu) - sc35 = scoremu[indmax][0] - sc10 = scoremu[indmax][1] - score_elems = [sc35,sc10] - posel = distsmu[indmax] - size35 = 14 - elems_maxi = sc35*maxi35_mu+sc10*maxi10_6 - elif maxi_elems == maxi9: - indmax = soma9.index(maxi9) - sc35 = score9[indmax][0] - sc10 = score9[indmax][1] - score_elems = [sc35,sc10] - posel = dists9[indmax] - size35 = 9 - elems_maxi = sc35*maxi35_9+sc10*maxi10_6 - elif maxi_elems == maxit4: - indmax = somat4.index(maxit4) - sc35 = scoret4[indmax][0] - sc10 = scoret4[indmax][1] - score_elems = [sc35,sc10] - posel = distst4[indmax] - size35 = 7 - elems_maxi = sc35*maxi35_t4+sc10*maxi10_6 - elif maxi_elems == maxicbb: - indmax = somacbb.index(maxicbb) - sc35 = scorecbb[indmax][0] - sc10 = scorecbb[indmax][1] - score_elems = [sc35,sc10] - posel = distscbb[indmax] - size35 = 7 - elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6 - else: - indmax = soma6.index(maxi6) - sc35 = score6[indmax][0] - sc10 = score6[indmax][1] - score_elems = [sc35,sc10] - posel = dists6[indmax] - size35 = 6 - elems_maxi = sc35*maxi35_6+sc10*maxi10_6 - - if score23 == maxiphage: - phage_score = score23 - posiphage = scores23.index(score23) - sizephage = 23 - elif score21 == maxiphage: - phage_score = score21 - posiphage = scores21.index(score21) - sizephage = 21 - elif score27 == maxiphage: - phage_score = score27 - posiphage = scores27.index(score27) - sizephage = 27 - else: - phage_score = score32 - posiphage = scores32.index(score32) - sizephage = 32 - - if elems_maxi >= max(score10_8)*maxi10_8: - i = posel[1] - ext = s[i-3:i-1] - if ext == 'TG': tg = 1 - else: tg = 0 - if elems_maxi > phage_max: host = 1 - else: - host = 0 - tg = 0 - sc = max(score10_8) - end35 = posel[0]+size35 - dist = posel[1]-end35 - scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)]) - posis.append([posel[1],posel[0],posiphage]) - sizes.append([6,size35,sizephage]) - else: - host = 1 - sc = max(score10_8) - i = score10_8.index(sc) - ext = s[i-3:i-1] - if ext == 'TG': tg = 1 - else: tg = 0 - if max(score10_8)*maxi10_8 > phage_max: host = 1 - else: - host = 0 - tg = 0 - end35 = posel[0]+size35 - dist = posel[1]-end35 - scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)]) - posis.append([i,posel[0],posiphage]) - sizes.append([8,size35,sizephage]) - score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq']) - posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage']) - sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage']) - df_all = pd.concat([score,posis,sizes],axis=1) - return df_all,dic - -def create_dftest(scores_test,dic_window,family,bacteria,lifecycle): - tudo = [] - tudo2 = [] - for ind,row in scores_test.iterrows(): - _,window = ind.split(':') - posis = dic_window[window] - strand=posis[2] - if strand == 1: ini=posis[0] - else: ini=posis[1] - seqprom = row['seq'] - score10 = row['score10'] - score10_8 = row['score10_8'] - score35 = row['score35'] - scorephage = row['score_phage'] - size10 = row['size10'] - size35 = row['size35'] - sizephage = row['size_phage'] - ini10 = row['pos10'] - tg = row['tg'] - host = row['host'] - ini35 = row['pos35'] - dist = row['dist'] - end10=ini10+size10 - iniphage = row['posphage'] - endphage = iniphage+sizephage - if strand == 1: - if host == 0: - new_seq = seqprom[iniphage:endphage] - new_ini = ini+iniphage+1 - new_end = ini+endphage - else: - if size10 == 6: - new_seq = seqprom[ini35:end10] - new_ini = ini+ini35+1 - new_end = ini+end10 - else: - new_seq = seqprom[ini10:end10] - new_ini = ini+ini10+1 - new_end = ini+end10 - new_pos = '('+str(new_ini)+'..'+str(new_end)+')' - else: - if host == 0: - new_seq = seqprom[iniphage:endphage] - new_ini = ini-endphage+1 - new_end = ini-iniphage - else: - if size10 == 6: - new_seq = seqprom[ini35:end10] - new_ini = ini-end10+1 - new_end = ini-ini35 - else: - new_seq = seqprom[ini10:end10] - new_ini = ini-end10+1 - new_end = ini-ini10 - new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')' - if size10 == 6: - size10_6 = 1 - else: - size10_6 = 0 - if size35 == 6: - size35_6 = 1 - size35_7 = 0 - size35_9 = 0 - size35_14 = 0 - elif size35 == 7: - size35_6 = 0 - size35_7 = 1 - size35_9 = 0 - size35_14 = 0 - elif size35 == 9: - size35_6 = 0 - size35_7 = 0 - size35_9 = 1 - size35_14 = 0 - else: - size35_6 = 0 - size35_7 = 0 - size35_9 = 0 - size35_14 = 1 - if sizephage == 23: - sizephage_23 = 1 - sizephage_21 = 0 - sizephage_32 = 0 - elif sizephage == 21: - sizephage_23 = 0 - sizephage_21 = 1 - sizephage_32 = 0 - elif sizephage == 32: - sizephage_23 = 0 - sizephage_21 = 0 - sizephage_32 = 1 - else: - sizephage_23 = 0 - sizephage_21 = 0 - sizephage_32 = 0 - if family == 'Podoviridae': - Podo = 1 - Sipho = 0 - Myo = 0 - elif family == 'Siphoviridae': - Podo = 0 - Sipho = 1 - Myo = 0 - elif family == 'Myoviridae': - Podo = 0 - Sipho = 0 - Myo = 1 - else: - Podo = 0 - Sipho = 0 - Myo = 0 - if bacteria == 'Bacillus': - bac = [1,0,0,0,0] - elif bacteria == 'Escherichia coli': - bac = [0,1,0,0,0] - elif bacteria == 'Klebsiella': - bac = [0,0,1,0,0] - elif bacteria == 'Pectobacterium': - bac = [0,0,0,1,0] - elif bacteria == 'Cronobacter': - bac = [0,0,0,0,1] - else: - bac = [0,0,0,0,0] - if lifecycle == 'virulent': tp = 0 - else: tp = 1 - fe = free_energy(str(seqprom)) - AT = freq_base(str(seqprom)) - linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6, - size35_6, size35_7, size35_9, size35_14, - sizephage_23, sizephage_21, - sizephage_32, tg, Podo, Sipho, Myo,tp] - linha.extend(bac) - tudo.append(linha) - linha2 = [new_pos,str(new_seq), host, size10_6, - score10, score10_8, size35_6, size35_7, size35_9,size35_14, - score35, dist, sizephage_23, sizephage_21, - sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT] - linha2.extend(bac) - tudo2.append(linha2) - df_test = pd.DataFrame(tudo, index=scores_test.index, - columns = ['score10', 'score10_8','score35', 'dist35_10', - 'scorephage','fe', 'freqAT', - 'host','size10', - 'size35_6', 'size35_7','size35_9','size35_14', - 'sizephage_23', - 'sizephage_21', 'sizephage_32', - 'TG', 'Podo', 'Sipho', 'Myo', 'tp', - 'Bacillus', 'EColi', - 'Pectobacterium','Klebsiella', 'Cronobacter']) - df_INFO = pd.DataFrame(tudo2, index=scores_test.index, - columns = ['Positions','Promoter Sequence','host','size10', - 'score10', 'score10_8', - 'size35_6', 'size35_7','size35_9','size35_14', - 'score35', 'dist35_10','sizephage_23', - 'sizephage_21', 'sizephage_32', - 'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT', - 'EColi', 'Salmonella', - 'Pectobacterium','Cronobacter', 'Streptococcus']) - return df_test,df_INFO - - -def get_predictions(scaler_file,model_file,test,df_testinfo,threshold): - from sklearn.externals import joblib - scaler = joblib.load(scaler_file) - model = joblib.load(model_file) - feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7]) - TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1) - scores = model.predict_proba(TEST_scaled) - pos_scores = np.empty((TEST_scaled.shape[0],0), float) - for x in scores: pos_scores = np.append(pos_scores,x[1]) - try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x - except ValueError: return 'The threshold value is not a float' - else: - if len(positive_indexes) == 0: return None - else: - positive_windows = TEST_scaled.index[positive_indexes] - INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']] - promoter_type = [] - for x in df_testinfo.loc[positive_windows,'host'].tolist(): - if x == 0: promoter_type.append('phage') - else: promoter_type.append('host') - INFO['Type'] = promoter_type - INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3) - INFO.index = positive_windows - return INFO - - -def get_finaldf(test,rec): - new_df = test.groupby(['Positions']) - groups = list(new_df.groups.keys()) - for i in range(len(groups)-1): - for j in range(i, len(groups)): - if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0]) - else: inii = int(groups[i][1:].split('..')[0]) - if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0]) - else: inij = int(groups[j][1:].split('..')[0]) - if inij < inii: - temp = groups[i] - groups[i] = groups[j] - groups[j] = temp - new_inds = [] - for g in groups: - inds = new_df.groups[g] - if len(inds) == 1: new_inds.append(inds[0]) - else: - maxi = max(new_df.get_group(g)['Scores']) - i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0] - new_inds.append(i) - - output = test.loc[new_inds,:] - strands = [] - new_pos = [] - old_pos = output['Positions'].tolist() - - from Bio.SeqFeature import SeqFeature, FeatureLocation - feats = rec.features - for ind, row in output.iterrows(): - pos = row['Positions'] - if 'complement' in pos: - strands.append('-') - new_pos.append(pos[10:]) - ini,end= pos[11:-1].split('..') - new_loc = FeatureLocation(int(ini),int(end),strand=-1) - else: - strands.append('+') - new_pos.append(pos) - ini,end= pos[1:-1].split('..') - new_loc = FeatureLocation(int(ini),int(end),strand=1) - feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']}) - feats.append(feat) - - output.insert(loc=0, column='Strand', value=strands) - output['Positions'] = new_pos - output.to_html('output.html',index=False, justify='center') - recs = [] - i = 0 - for ind,row in output.iterrows(): - s = Seq(row['Promoter Sequence']) - posis = old_pos[i] - typ = row['Type'] - score = row['Scores'] - sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score)) - recs.append(sq) - i += 1 - SeqIO.write(recs, 'output.fasta','fasta') - new_rec = rec - new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA() - new_feats = sorted(feats, key=lambda x: x.location.start) - new_rec.features = new_feats - SeqIO.write(new_rec,'output.gb','genbank') - -if __name__== "__main__": - - import sys - import os - __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - - gen_format = sys.argv[1] - genome_file = sys.argv[2] - both = sys.argv[3] - threshold = sys.argv[4] - family = sys.argv[5] - host = sys.argv[6] - phage_type = sys.argv[7] - model = sys.argv[8] - ''' - gen_format = 'genbank' - genome_file = 'test-data/NC_015264.gb' - both = False - threshold = '0.50' - family = 'Podoviridae' - host = 'Pseudomonas' - phage_type = 'virulent' - model = 'SVM2400' - #model = 'ANN1600' - ''' - - rec = SeqIO.read(genome_file, gen_format) - test_windows = get_testseqs65(gen_format, genome_file,both) - try: score_test,dic_window = get_testScores(__location__,test_windows) - except IndexError: print('Error. Input sequence can only have A,C,G or T') - else: - df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type) - if model == 'ANN1600': - scaler_file = os.path.join(__location__, 'scaler1600.sav') - model_file = os.path.join(__location__, 'model1600.sav') - preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold) - if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold)) - elif type(preds) == str: print(preds) - else: output = get_finaldf(preds,rec) - else: - scaler_file = os.path.join(__location__, 'scaler2400.sav') - model_file = os.path.join(__location__, 'model2400.sav') - new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]] - preds = get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold) - if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold)) - elif type(preds) == str: print(preds) - else: output = get_finaldf(preds,rec) -