Previous changeset 28:4289d1b76d66 (2019-10-28) Next changeset 30:c60c460307e7 (2019-10-28) |
Commit message:
Update phagepromoter.py -> pickle instead joblib |
added:
phagepromoter.py |
b |
diff -r 4289d1b76d66 -r a555e95b2066 phagepromoter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/phagepromoter.py Mon Oct 28 12:39:04 2019 -0400 |
[ |
b'@@ -0,0 +1,570 @@\n+# -*- coding: utf-8 -*-\n+"""\n+Created on Thu Jul 19 13:45:05 2018\n+\n+@author: Marta\n+"""\n+\n+from Bio import SeqIO\n+import numpy as np\n+import pandas as pd\n+from auxiliar import free_energy,freq_base\n+from Bio.Seq import Seq\n+from Bio.SeqRecord import SeqRecord\n+from Bio.Alphabet import IUPAC\n+from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle\n+\n+#division of the test genome in sequences of 65 bp\n+def get_testseqs65(form,fic,both=False):\n+ ALL = []\n+ indexes = []\n+ a = 0\n+ rec = SeqIO.read(fic,form)\n+ genome = rec.seq\n+ i = 0\n+ j = 65\n+ while j < len(genome):\n+ s = genome[i:j]\n+ ALL.append([1,i,j,s])\n+ i += 20\n+ j += 20\n+ a += 1\n+ indexes.append(rec.name+":"+str(a))\n+ if both:\n+ comp = genome.reverse_complement()\n+ size = len(rec.seq)\n+ i = 0\n+ j = 65\n+ while j < len(comp):\n+ s = comp[i:j]\n+ ALL.append([-1,size-j,size-i,s])\n+ i += 20\n+ j += 20\n+ a += 1\n+ indexes.append(rec.name+":"+str(a))\n+ df = pd.DataFrame(ALL, index=indexes, columns=[\'strand\',\'iniprom\',\'endprom\',\'seq\'])\n+ return df\n+\n+#calculate the scores of all sequences (similar to get_posScores and get_negScores)\n+def get_testScores(loc,test):\n+ scores = []\n+ posis = []\n+ sizes = []\n+ dic = {}\n+ for ind,row in test.iterrows():\n+ _,window = ind.split(\':\')\n+ strand = row[\'strand\']\n+ ini = row[\'iniprom\']\n+ end = row[\'endprom\']\n+ seq = row[\'seq\']\n+ pos = [ini,end,strand]\n+ dic[window] = pos\n+ s = seq\n+ score10_6,pos10_6 = get_scores(os.path.join(loc,\'pssm10_6.txt\'), s)\n+ maxi10_6 = get_max_pssm(os.path.join(loc,\'pssm10_6.txt\'))\n+ score10_8,pos10_8 = get_scores(os.path.join(loc,\'pssm10_8.txt\'), s)\n+ maxi10_8 = get_max_pssm(os.path.join(loc,\'pssm10_8.txt\'))\n+ scores23,pos23 = get_scores(os.path.join(loc,\'pssm_23.txt\'), s)\n+ maxi23 = get_max_pssm(os.path.join(loc,\'pssm_23.txt\'))\n+ scores21,pos21 = get_scores(os.path.join(loc,\'pssm_21.txt\'), s)\n+ maxi21 = get_max_pssm(os.path.join(loc,\'pssm_21.txt\'))\n+ scores27,pos27 = get_scores(os.path.join(loc,\'pssm_27.txt\'), s)\n+ maxi27 = get_max_pssm(os.path.join(loc,\'pssm_27.txt\'))\n+ scores32,pos32 = get_scores(os.path.join(loc,\'pssm_32.txt\'), s)\n+ maxi32 = get_max_pssm(os.path.join(loc,\'pssm_32.txt\'))\n+ score23 = max(scores23)\n+ score21 = max(scores21)\n+ score27 = max(scores27)\n+ score32 = max(scores32)\n+ maxiphage = max(score23,score21,score27,score32)\n+ if maxiphage == score23: phage_max = score23*maxi23\n+ elif maxiphage == score21: phage_max = score21*maxi21\n+ elif maxiphage == score27: phage_max = score27*maxi27\n+ elif maxiphage == score32: phage_max = score32*maxi32\n+ score35_6,pos35_6 = get_scores(os.path.join(loc,\'pssm35_6.txt\'), s)\n+ maxi35_6 = get_max_pssm(os.path.join(loc,\'pssm35_6.txt\'))\n+ score35_9,pos35_9 = get_scores(os.path.join(loc,\'pssm35_9.txt\'), s)\n+ maxi35_9 = get_max_pssm(os.path.join(loc,\'pssm35_9.txt\'))\n+ score35_t4,pos35_t4 = get_scores(os.path.join(loc,\'pssm35_t4.txt\'), s)\n+ maxi35_t4 = get_max_pssm(os.path.join(loc,\'pssm35_t4.txt\'))\n+ score35_cbb,pos35_cbb = get_scores(os.path.join(loc,\'pssm35_cbb.txt\'), s)\n+ maxi35_cbb = get_max_pssm(os.path.join(loc,\'pssm35_cbb.txt\'))\n+ score35_lb,pos35_lb = get_scores(os.path.join(loc,\'pssm35_lb.txt\'),s)\n+ maxi35_lb = get_max_pssm(os.path.join(loc,\'pssm35_lb.txt\'))\n+ score35_mu, pos35_mu = get_scores(os.path.join(loc,\'pssm35_mu.txt\'),s)\n+ maxi35_mu = get_max_pssm(os.path.join(loc,\'pssm35_mu.txt\'))\n+ \n+ dists6 = []\n+ score6 = []\n+ for p in pos10_6:\n+ for a in range(14,22):\n+ d = p-a-6\n+ '..b' inds = new_df.groups[g]\n+ if len(inds) == 1: new_inds.append(inds[0])\n+ else:\n+ maxi = max(new_df.get_group(g)[\'Scores\'])\n+ i = new_df.groups[g][new_df.get_group(g)[\'Scores\']==maxi][0]\n+ new_inds.append(i)\n+ \n+ output = test.loc[new_inds,:]\n+ strands = []\n+ new_pos = []\n+ old_pos = output[\'Positions\'].tolist()\n+ \n+ from Bio.SeqFeature import SeqFeature, FeatureLocation\n+ feats = rec.features\n+ for ind, row in output.iterrows():\n+ pos = row[\'Positions\']\n+ if \'complement\' in pos: \n+ strands.append(\'-\')\n+ new_pos.append(pos[10:])\n+ ini,end= pos[11:-1].split(\'..\')\n+ new_loc = FeatureLocation(int(ini),int(end),strand=-1)\n+ else: \n+ strands.append(\'+\')\n+ new_pos.append(pos)\n+ ini,end= pos[1:-1].split(\'..\')\n+ new_loc = FeatureLocation(int(ini),int(end),strand=1)\n+ feat = SeqFeature(new_loc, type=\'regulatory\',qualifiers={\'regulatory_class\':[\'promoter\'], \'note=\':[\'predicted by PhagePromoter\']})\n+ feats.append(feat) \n+ \n+ output.insert(loc=0, column=\'Strand\', value=strands)\n+ output[\'Positions\'] = new_pos\n+ output.to_html(\'output.html\',index=False, justify=\'center\')\n+ recs = []\n+ i = 0\n+ for ind,row in output.iterrows():\n+ s = Seq(row[\'Promoter Sequence\'])\n+ posis = old_pos[i]\n+ typ = row[\'Type\']\n+ score = row[\'Scores\']\n+ sq = SeqRecord(seq=s, id=ind, description=typ+\' \'+posis+\' score=\'+str(score))\n+ recs.append(sq)\n+ i += 1\n+ SeqIO.write(recs, \'output.fasta\',\'fasta\')\n+ new_rec = rec\n+ new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()\n+ new_feats = sorted(feats, key=lambda x: x.location.start)\n+ new_rec.features = new_feats\n+ SeqIO.write(new_rec,\'output.gb\',\'genbank\')\n+\n+if __name__== "__main__":\n+ \n+ import sys\n+ import os\n+ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\n+ \n+ gen_format = sys.argv[1]\n+ genome_file = sys.argv[2]\n+ both = sys.argv[3]\n+ threshold = sys.argv[4]\n+ family = sys.argv[5]\n+ host = sys.argv[6]\n+ phage_type = sys.argv[7]\n+ model = sys.argv[8]\n+ \'\'\'\n+ gen_format = \'genbank\'\n+ genome_file = \'test-data/NC_015264.gb\'\n+ both = False\n+ threshold = \'0.50\'\n+ family = \'Podoviridae\'\n+ host = \'Pseudomonas\'\n+ phage_type = \'virulent\'\n+ model = \'SVM2400\'\n+ #model = \'ANN1600\'\n+ \'\'\'\n+ \n+ rec = SeqIO.read(genome_file, gen_format)\n+ test_windows = get_testseqs65(gen_format, genome_file,both)\n+ try: score_test,dic_window = get_testScores(__location__,test_windows)\n+ except IndexError: print(\'Error. Input sequence can only have A,C,G or T\')\n+ else:\n+ df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)\n+ if model == \'ANN1600\':\n+ scaler_file = os.path.join(__location__, \'scaler1600.sav\')\n+ model_file = os.path.join(__location__, \'model1600.sav\')\n+ preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)\n+ if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n+ elif type(preds) == str: print(preds)\n+ else: output = get_finaldf(preds,rec)\n+ else:\n+ scaler_file = os.path.join(__location__, \'scaler2400.sav\')\n+ model_file = os.path.join(__location__, \'model2400.sav\')\n+ new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]\n+ preds = get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)\n+ if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n+ elif type(preds) == str: print(preds)\n+ else: output = get_finaldf(preds,rec)\n+ \n' |