Repository 'phagepromoter'
hg clone https://toolshed.g2.bx.psu.edu/repos/martasampaio/phagepromoter

Changeset 28:4289d1b76d66 (2019-10-28)
Previous changeset 27:636e332434ff (2019-10-28) Next changeset 29:a555e95b2066 (2019-10-28)
Commit message:
Deleted selected files
removed:
model1600.sav
model2400.sav
phagepromoter.py
scaler1600.sav
scaler2400.sav
b
diff -r 636e332434ff -r 4289d1b76d66 model1600.sav
b
Binary file model1600.sav has changed
b
diff -r 636e332434ff -r 4289d1b76d66 model2400.sav
b
Binary file model2400.sav has changed
b
diff -r 636e332434ff -r 4289d1b76d66 phagepromoter.py
--- a/phagepromoter.py Mon Oct 28 11:57:47 2019 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,570 +0,0 @@\n-# -*- coding: utf-8 -*-\n-"""\n-Created on Thu Jul 19 13:45:05 2018\n-\n-@author: Marta\n-"""\n-\n-from Bio import SeqIO\n-import numpy as np\n-import pandas as pd\n-from auxiliar import free_energy,freq_base\n-from Bio.Seq import Seq\n-from Bio.SeqRecord import SeqRecord\n-from Bio.Alphabet import IUPAC\n-from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle\n-\n-#division of the test genome in sequences of 65 bp\n-def get_testseqs65(form,fic,both=False):\n-    ALL = []\n-    indexes = []\n-    a = 0\n-    rec = SeqIO.read(fic,form)\n-    genome = rec.seq\n-    i = 0\n-    j = 65\n-    while j < len(genome):\n-        s = genome[i:j]\n-        ALL.append([1,i,j,s])\n-        i += 20\n-        j += 20\n-        a += 1\n-        indexes.append(rec.name+":"+str(a))\n-    if both:\n-        comp = genome.reverse_complement()\n-        size = len(rec.seq)\n-        i = 0\n-        j = 65\n-        while j < len(comp):\n-            s = comp[i:j]\n-            ALL.append([-1,size-j,size-i,s])\n-            i += 20\n-            j += 20\n-            a += 1\n-            indexes.append(rec.name+":"+str(a))\n-    df = pd.DataFrame(ALL, index=indexes, columns=[\'strand\',\'iniprom\',\'endprom\',\'seq\'])\n-    return df\n-\n-#calculate the scores of all sequences (similar to get_posScores and get_negScores)\n-def get_testScores(loc,test):\n-    scores = []\n-    posis = []\n-    sizes = []\n-    dic = {}\n-    for ind,row in test.iterrows():\n-        _,window = ind.split(\':\')\n-        strand = row[\'strand\']\n-        ini = row[\'iniprom\']\n-        end = row[\'endprom\']\n-        seq = row[\'seq\']\n-        pos = [ini,end,strand]\n-        dic[window] = pos\n-        s = seq\n-        score10_6,pos10_6 = get_scores(os.path.join(loc,\'pssm10_6.txt\'), s)\n-        maxi10_6 = get_max_pssm(os.path.join(loc,\'pssm10_6.txt\'))\n-        score10_8,pos10_8 = get_scores(os.path.join(loc,\'pssm10_8.txt\'), s)\n-        maxi10_8 = get_max_pssm(os.path.join(loc,\'pssm10_8.txt\'))\n-        scores23,pos23 = get_scores(os.path.join(loc,\'pssm_23.txt\'), s)\n-        maxi23 = get_max_pssm(os.path.join(loc,\'pssm_23.txt\'))\n-        scores21,pos21 = get_scores(os.path.join(loc,\'pssm_21.txt\'), s)\n-        maxi21 = get_max_pssm(os.path.join(loc,\'pssm_21.txt\'))\n-        scores27,pos27 = get_scores(os.path.join(loc,\'pssm_27.txt\'), s)\n-        maxi27 = get_max_pssm(os.path.join(loc,\'pssm_27.txt\'))\n-        scores32,pos32 = get_scores(os.path.join(loc,\'pssm_32.txt\'), s)\n-        maxi32 = get_max_pssm(os.path.join(loc,\'pssm_32.txt\'))\n-        score23 = max(scores23)\n-        score21 = max(scores21)\n-        score27 = max(scores27)\n-        score32 = max(scores32)\n-        maxiphage = max(score23,score21,score27,score32)\n-        if maxiphage == score23: phage_max = score23*maxi23\n-        elif maxiphage == score21: phage_max = score21*maxi21\n-        elif maxiphage == score27: phage_max = score27*maxi27\n-        elif maxiphage == score32: phage_max = score32*maxi32\n-        score35_6,pos35_6 = get_scores(os.path.join(loc,\'pssm35_6.txt\'), s)\n-        maxi35_6 = get_max_pssm(os.path.join(loc,\'pssm35_6.txt\'))\n-        score35_9,pos35_9 = get_scores(os.path.join(loc,\'pssm35_9.txt\'), s)\n-        maxi35_9 = get_max_pssm(os.path.join(loc,\'pssm35_9.txt\'))\n-        score35_t4,pos35_t4 = get_scores(os.path.join(loc,\'pssm35_t4.txt\'), s)\n-        maxi35_t4 = get_max_pssm(os.path.join(loc,\'pssm35_t4.txt\'))\n-        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,\'pssm35_cbb.txt\'), s)\n-        maxi35_cbb = get_max_pssm(os.path.join(loc,\'pssm35_cbb.txt\'))\n-        score35_lb,pos35_lb = get_scores(os.path.join(loc,\'pssm35_lb.txt\'),s)\n-        maxi35_lb = get_max_pssm(os.path.join(loc,\'pssm35_lb.txt\'))\n-        score35_mu, pos35_mu = get_scores(os.path.join(loc,\'pssm35_mu.txt\'),s)\n-        maxi35_mu = get_max_pssm(os.path.join(loc,\'pssm35_mu.txt\'))\n-        \n-        dists6 = []\n-        score6 = []\n-        for p in pos10_6:\n-            for a in range(14,22):\n-                d = p-a-6\n-            '..b'    inds = new_df.groups[g]\n-        if len(inds) == 1: new_inds.append(inds[0])\n-        else:\n-            maxi = max(new_df.get_group(g)[\'Scores\'])\n-            i = new_df.groups[g][new_df.get_group(g)[\'Scores\']==maxi][0]\n-            new_inds.append(i)\n-    \n-    output = test.loc[new_inds,:]\n-    strands = []\n-    new_pos = []\n-    old_pos = output[\'Positions\'].tolist()\n-    \n-    from Bio.SeqFeature import SeqFeature, FeatureLocation\n-    feats = rec.features\n-    for ind, row in output.iterrows():\n-        pos = row[\'Positions\']\n-        if \'complement\' in pos: \n-            strands.append(\'-\')\n-            new_pos.append(pos[10:])\n-            ini,end= pos[11:-1].split(\'..\')\n-            new_loc = FeatureLocation(int(ini),int(end),strand=-1)\n-        else: \n-            strands.append(\'+\')\n-            new_pos.append(pos)\n-            ini,end= pos[1:-1].split(\'..\')\n-            new_loc = FeatureLocation(int(ini),int(end),strand=1)\n-        feat = SeqFeature(new_loc, type=\'regulatory\',qualifiers={\'regulatory_class\':[\'promoter\'], \'note=\':[\'predicted by PhagePromoter\']})\n-        feats.append(feat)            \n-            \n-    output.insert(loc=0, column=\'Strand\', value=strands)\n-    output[\'Positions\'] = new_pos\n-    output.to_html(\'output.html\',index=False, justify=\'center\')\n-    recs = []\n-    i = 0\n-    for ind,row in output.iterrows():\n-        s = Seq(row[\'Promoter Sequence\'])\n-        posis = old_pos[i]\n-        typ = row[\'Type\']\n-        score = row[\'Scores\']\n-        sq = SeqRecord(seq=s, id=ind, description=typ+\' \'+posis+\' score=\'+str(score))\n-        recs.append(sq)\n-        i += 1\n-    SeqIO.write(recs, \'output.fasta\',\'fasta\')\n-    new_rec = rec\n-    new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()\n-    new_feats = sorted(feats, key=lambda x: x.location.start)\n-    new_rec.features = new_feats\n-    SeqIO.write(new_rec,\'output.gb\',\'genbank\')\n-\n-if __name__== "__main__":\n-    \n-    import sys\n-    import os\n-    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\n-    \n-    gen_format = sys.argv[1]\n-    genome_file = sys.argv[2]\n-    both = sys.argv[3]\n-    threshold = sys.argv[4]\n-    family = sys.argv[5]\n-    host = sys.argv[6]\n-    phage_type = sys.argv[7]\n-    model = sys.argv[8]\n-    \'\'\'\n-    gen_format = \'genbank\'\n-    genome_file = \'test-data/NC_015264.gb\'\n-    both = False\n-    threshold = \'0.50\'\n-    family = \'Podoviridae\'\n-    host = \'Pseudomonas\'\n-    phage_type = \'virulent\'\n-    model = \'SVM2400\'\n-    #model = \'ANN1600\'\n-    \'\'\'\n-    \n-    rec = SeqIO.read(genome_file, gen_format)\n-    test_windows = get_testseqs65(gen_format, genome_file,both)\n-    try: score_test,dic_window = get_testScores(__location__,test_windows)\n-    except IndexError: print(\'Error. Input sequence can only have A,C,G or T\')\n-    else:\n-        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)\n-        if model == \'ANN1600\':\n-            scaler_file = os.path.join(__location__, \'scaler1600.sav\')\n-            model_file = os.path.join(__location__, \'model1600.sav\')\n-            preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)\n-            if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n-            elif type(preds) == str: print(preds)\n-            else: output = get_finaldf(preds,rec)\n-        else:\n-            scaler_file = os.path.join(__location__, \'scaler2400.sav\')\n-            model_file = os.path.join(__location__, \'model2400.sav\')\n-            new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]\n-            preds =  get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)\n-            if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n-            elif type(preds) == str: print(preds)\n-            else: output = get_finaldf(preds,rec)\n-        \n'
b
diff -r 636e332434ff -r 4289d1b76d66 scaler1600.sav
b
Binary file scaler1600.sav has changed
b
diff -r 636e332434ff -r 4289d1b76d66 scaler2400.sav
b
Binary file scaler2400.sav has changed