Repository 'phage_promoters'
hg clone https://toolshed.g2.bx.psu.edu/repos/martasampaio/phage_promoters

Changeset 12:c6ab2150079e (2018-08-01)
Previous changeset 11:7036941f737c (2018-08-01) Next changeset 13:e06247ca6776 (2018-08-01)
Commit message:
Uploaded
added:
phage_promoters.py
b
diff -r 7036941f737c -r c6ab2150079e phage_promoters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phage_promoters.py Wed Aug 01 05:30:59 2018 -0400
[
b'@@ -0,0 +1,507 @@\n+# -*- coding: utf-8 -*-\n+"""\n+Created on Thu Jul 19 13:45:05 2018\n+\n+@author: Marta\n+"""\n+\n+from Bio import SeqIO\n+import numpy as np\n+import pandas as pd\n+from auxiliar import free_energy,freq_base\n+from Bio.Seq import Seq\n+from Bio.SeqRecord import SeqRecord\n+from Bio.Alphabet import IUPAC\n+from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle\n+\n+#division of the test genome in sequences of 65 bp\n+def get_testseqs65(form,fic,both=False):\n+    ALL = []\n+    indexes = []\n+    a = 0\n+    rec = SeqIO.read(fic,form)\n+    genome = rec.seq\n+    i = 0\n+    j = 65\n+    while j < len(genome):\n+        s = genome[i:j]\n+        ALL.append([1,i,j,s])\n+        i += 20\n+        j += 20\n+        a += 1\n+        indexes.append(rec.name+":"+str(a))\n+    if both:\n+        i = 0\n+        j = 65\n+        while j < len(genome):\n+            s = genome[i:j].reverse_complement()\n+            ALL.append([-1,i,j,s])\n+            i += 20\n+            j += 20\n+            a += 1\n+            indexes.append(rec.name+":"+str(a))\n+    df = pd.DataFrame(ALL, index=indexes, columns=[\'strand\',\'iniprom\',\'endprom\',\'seq\'])\n+    return df\n+\n+#calculate the scores of all sequences (similar to get_posScores and get_negScores)\n+def get_testScores(loc,test):\n+    scores = []\n+    posis = []\n+    sizes = []\n+    dic = {}\n+    for ind,row in test.iterrows():\n+        _,window = ind.split(\':\')\n+        strand = row[\'strand\']\n+        ini = row[\'iniprom\']\n+        end = row[\'endprom\']\n+        seq = row[\'seq\']\n+        pos = [ini,end,strand]\n+        dic[window] = pos\n+        s = seq\n+        score10_6,pos10_6 = get_scores(os.path.join(loc,\'pssm10_6.txt\'), s)\n+        maxi10_6 = get_max_pssm(os.path.join(loc,\'pssm10_6.txt\'))\n+        score10_8,pos10_8 = get_scores(os.path.join(loc,\'pssm10_8.txt\'), s)\n+        maxi10_8 = get_max_pssm(os.path.join(loc,\'pssm10_8.txt\'))\n+        scores23,pos23 = get_scores(os.path.join(loc,\'pssm_23.txt\'), s)\n+        maxi23 = get_max_pssm(os.path.join(loc,\'pssm_23.txt\'))\n+        scores21,pos21 = get_scores(os.path.join(loc,\'pssm_21.txt\'), s)\n+        maxi21 = get_max_pssm(os.path.join(loc,\'pssm_21.txt\'))\n+        scores27,pos27 = get_scores(os.path.join(loc,\'pssm_27.txt\'), s)\n+        maxi27 = get_max_pssm(os.path.join(loc,\'pssm_27.txt\'))\n+        scores32,pos32 = get_scores(os.path.join(loc,\'pssm_32.txt\'), s)\n+        maxi32 = get_max_pssm(os.path.join(loc,\'pssm_32.txt\'))\n+        score23 = max(scores23)\n+        score21 = max(scores21)\n+        score27 = max(scores27)\n+        score32 = max(scores32)\n+        maxiphage = max(score23,score21,score27,score32)\n+        if maxiphage == score23: phage_max = score23*maxi23\n+        elif maxiphage == score21: phage_max = score21*maxi21\n+        elif maxiphage == score27: phage_max = score27*maxi27\n+        elif maxiphage == score32: phage_max = score32*maxi32\n+        score35_6,pos35_6 = get_scores(os.path.join(loc,\'pssm35_6.txt\'), s)\n+        maxi35_6 = get_max_pssm(os.path.join(loc,\'pssm35_6.txt\'))\n+        score35_9,pos35_9 = get_scores(os.path.join(loc,\'pssm35_9.txt\'), s)\n+        maxi35_9 = get_max_pssm(os.path.join(loc,\'pssm35_9.txt\'))\n+        score35_t4,pos35_t4 = get_scores(os.path.join(loc,\'pssm35_t4.txt\'), s)\n+        maxi35_t4 = get_max_pssm(os.path.join(loc,\'pssm35_t4.txt\'))\n+        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,\'pssm35_cbb.txt\'), s)\n+        maxi35_cbb = get_max_pssm(os.path.join(loc,\'pssm35_cbb.txt\'))\n+        score35_lb,pos35_lb = get_scores(os.path.join(loc,\'pssm35_lb.txt\'),s)\n+        maxi35_lb = get_max_pssm(os.path.join(loc,\'pssm35_lb.txt\'))\n+        score35_mu, pos35_mu = get_scores(os.path.join(loc,\'pssm35_mu.txt\'),s)\n+        maxi35_mu = get_max_pssm(os.path.join(loc,\'pssm35_mu.txt\'))\n+        \n+        dists6 = []\n+        score6 = []\n+        for p in pos10_6:\n+            for a in range(14,22):\n+                d = p-a-6\n+                if d >= 0: \n+                    s10 = score10_6[p]\n+ '..b'ectobacterium\',\'Cronobacter\'])\n+    return df_test,df_INFO\n+\n+\n+def get_finaldf(test):\n+    new_df = test.groupby([\'positions\'])\n+    groups = list(new_df.groups.keys())\n+    for i in range(len(groups)-1):\n+        for j in range(i, len(groups)):\n+            inii = int(groups[i][1:].split(\'..\')[0])\n+            inij = int(groups[j][1:].split(\'..\')[0])\n+            if inij < inii:\n+                temp = groups[i]\n+                groups[i] = groups[j]\n+                groups[j] = temp\n+    new_inds = []\n+    for g in groups:\n+        inds = new_df.groups[g]\n+        if len(inds) == 1: new_inds.append(inds[0])\n+        else:\n+            maxi = max(new_df.get_group(g)[\'scores\'])\n+            i = new_df.groups[g][new_df.get_group(g)[\'scores\']==maxi][0]\n+            new_inds.append(i)\n+    \n+    output = test.loc[new_inds,:]\n+    output.to_html(\'output.html\',index=False)\n+    recs = []\n+    for ind,row in output.iterrows():\n+        s = Seq(row[\'promoter_seq\'])\n+        posis = row[\'positions\']\n+        typ = row[\'promoter_type\']\n+        score = row[\'scores\']\n+        sq = SeqRecord(seq=s, id=ind, description=typ+\' \'+posis+\' score=\'+str(score))\n+        recs.append(sq)\n+    #SeqIO.write(recs, \'output.fasta\',\'fasta\')\n+    \n+def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):\n+    from sklearn.externals import joblib\n+    scaler = joblib.load(scaler_file)\n+    model = joblib.load(model_file)\n+    feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])\n+    TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)\n+    scores = model.predict_proba(TEST_scaled)\n+    pos_scores = np.empty((TEST_scaled.shape[0],0), float)\n+    for x in scores: pos_scores = np.append(pos_scores,x[1])\n+    try: positive_indexes = np.nonzero(pos_scores>=float(threshold))[0] #escolher os positivos, podia ser escolher com score > x\n+    except ValueError: return \'The threshold value is not a float\'\n+    else:\n+        if len(positive_indexes) == 0: return None\n+        else:\n+            positive_windows = TEST_scaled.index[positive_indexes]\n+            INFO = df_testinfo.loc[positive_windows,[\'positions\',\'promoter_seq\']]\n+            promoter_type = []\n+            for x in df_testinfo.loc[positive_windows,\'host\'].tolist():\n+                if x == 0: promoter_type.append(\'phage\')\n+                else: promoter_type.append(\'host\')\n+            INFO[\'promoter_type\'] = promoter_type\n+            INFO[\'scores\'] = np.around(pos_scores[positive_indexes],decimals=3)\n+            INFO.index = positive_windows\n+            return INFO\n+\n+if __name__== "__main__":\n+    \n+    import sys\n+    import os\n+    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\n+    scaler_file = os.path.join(__location__, \'scaler.sav\')\n+    model_file = os.path.join(__location__, \'model.sav\')\n+    \n+    gen_format = sys.argv[1]\n+    genome_file = sys.argv[2]\n+    both = sys.argv[3]\n+    threshold = sys.argv[4]\n+    family = sys.argv[5]\n+    host = sys.argv[6]\n+    phage_type = sys.argv[7]\n+    \'\'\'\n+    gen_format = \'gb\'\n+    genome_file = \'test-data/NC_015264.gb\'\n+    genbank_fasta = \'genbank\'\n+    both = False\n+    threshold = \'0.50\'\n+    family = \'Podoviridae\'\n+    host = \'Pseudomonas\'\n+    phage_type = \'virulent\'\n+    \'\'\'\n+\n+    test_windows = get_testseqs65(gen_format, genome_file,both)\n+    try: score_test,dic_window = get_testScores(__location__,test_windows)\n+    \n+    except IndexError: print(\'Error. Input sequence can only have A,C,G or T\')\n+    \n+    else:\n+        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)\n+        \n+        preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)\n+        if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n+        elif type(preds) == str: print(preds)\n+        else: output = get_finaldf(preds)\n+        \n'