Repository 'phage_promoters'
hg clone https://toolshed.g2.bx.psu.edu/repos/martasampaio/phage_promoters

Changeset 19:4691c4418b00 (2018-09-10)
Previous changeset 18:f391041379cc (2018-08-01) Next changeset 20:d88f0e7350f9 (2018-09-10)
Commit message:
Deleted selected files
removed:
model.sav
phage_promoters.py
phage_promoters.xml
scaler.sav
b
diff -r f391041379cc -r 4691c4418b00 model.sav
b
Binary file model.sav has changed
b
diff -r f391041379cc -r 4691c4418b00 phage_promoters.py
--- a/phage_promoters.py Wed Aug 01 05:59:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,507 +0,0 @@\n-# -*- coding: utf-8 -*-\n-"""\n-Created on Thu Jul 19 13:45:05 2018\n-\n-@author: Marta\n-"""\n-\n-from Bio import SeqIO\n-import numpy as np\n-import pandas as pd\n-from auxiliar import free_energy,freq_base\n-from Bio.Seq import Seq\n-from Bio.SeqRecord import SeqRecord\n-from Bio.Alphabet import IUPAC\n-from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle\n-\n-#division of the test genome in sequences of 65 bp\n-def get_testseqs65(form,fic,both=False):\n-    ALL = []\n-    indexes = []\n-    a = 0\n-    rec = SeqIO.read(fic,form)\n-    genome = rec.seq\n-    i = 0\n-    j = 65\n-    while j < len(genome):\n-        s = genome[i:j]\n-        ALL.append([1,i,j,s])\n-        i += 20\n-        j += 20\n-        a += 1\n-        indexes.append(rec.name+":"+str(a))\n-    if both:\n-        i = 0\n-        j = 65\n-        while j < len(genome):\n-            s = genome[i:j].reverse_complement()\n-            ALL.append([-1,i,j,s])\n-            i += 20\n-            j += 20\n-            a += 1\n-            indexes.append(rec.name+":"+str(a))\n-    df = pd.DataFrame(ALL, index=indexes, columns=[\'strand\',\'iniprom\',\'endprom\',\'seq\'])\n-    return df\n-\n-#calculate the scores of all sequences (similar to get_posScores and get_negScores)\n-def get_testScores(loc,test):\n-    scores = []\n-    posis = []\n-    sizes = []\n-    dic = {}\n-    for ind,row in test.iterrows():\n-        _,window = ind.split(\':\')\n-        strand = row[\'strand\']\n-        ini = row[\'iniprom\']\n-        end = row[\'endprom\']\n-        seq = row[\'seq\']\n-        pos = [ini,end,strand]\n-        dic[window] = pos\n-        s = seq\n-        score10_6,pos10_6 = get_scores(os.path.join(loc,\'pssm10_6.txt\'), s)\n-        maxi10_6 = get_max_pssm(os.path.join(loc,\'pssm10_6.txt\'))\n-        score10_8,pos10_8 = get_scores(os.path.join(loc,\'pssm10_8.txt\'), s)\n-        maxi10_8 = 
get_max_pssm(os.path.join(loc,\'pssm10_8.txt\'))\n-        scores23,pos23 = get_scores(os.path.join(loc,\'pssm_23.txt\'), s)\n-        maxi23 = get_max_pssm(os.path.join(loc,\'pssm_23.txt\'))\n-        scores21,pos21 = get_scores(os.path.join(loc,\'pssm_21.txt\'), s)\n-        maxi21 = get_max_pssm(os.path.join(loc,\'pssm_21.txt\'))\n-        scores27,pos27 = get_scores(os.path.join(loc,\'pssm_27.txt\'), s)\n-        maxi27 = get_max_pssm(os.path.join(loc,\'pssm_27.txt\'))\n-        scores32,pos32 = get_scores(os.path.join(loc,\'pssm_32.txt\'), s)\n-        maxi32 = get_max_pssm(os.path.join(loc,\'pssm_32.txt\'))\n-        score23 = max(scores23)\n-        score21 = max(scores21)\n-        score27 = max(scores27)\n-        score32 = max(scores32)\n-        maxiphage = max(score23,score21,score27,score32)\n-        if maxiphage == score23: phage_max = score23*maxi23\n-        elif maxiphage == score21: phage_max = score21*maxi21\n-        elif maxiphage == score27: phage_max = score27*maxi27\n-        elif maxiphage == score32: phage_max = score32*maxi32\n-        score35_6,pos35_6 = get_scores(os.path.join(loc,\'pssm35_6.txt\'), s)\n-        maxi35_6 = get_max_pssm(os.path.join(loc,\'pssm35_6.txt\'))\n-        score35_9,pos35_9 = get_scores(os.path.join(loc,\'pssm35_9.txt\'), s)\n-        maxi35_9 = get_max_pssm(os.path.join(loc,\'pssm35_9.txt\'))\n-        score35_t4,pos35_t4 = get_scores(os.path.join(loc,\'pssm35_t4.txt\'), s)\n-        maxi35_t4 = get_max_pssm(os.path.join(loc,\'pssm35_t4.txt\'))\n-        score35_cbb,pos35_cbb = get_scores(os.path.join(loc,\'pssm35_cbb.txt\'), s)\n-        maxi35_cbb = get_max_pssm(os.path.join(loc,\'pssm35_cbb.txt\'))\n-        score35_lb,pos35_lb = get_scores(os.path.join(loc,\'pssm35_lb.txt\'),s)\n-        maxi35_lb = get_max_pssm(os.path.join(loc,\'pssm35_lb.txt\'))\n-        score35_mu, pos35_mu = get_scores(os.path.join(loc,\'pssm35_mu.txt\'),s)\n-        maxi35_mu = get_max_pssm(os.path.join(loc,\'pssm35_mu.txt\'))\n-    
    \n-        dists6 = []\n-        score6 = []\n-        for p in pos10_6:\n-            for a in range(14,22):\n-                d = p-a-6\n-                if d >= 0: \n-                    s10 = score10_6[p]\n- '..b'Pectobacterium\',\'Cronobacter\'])\n-    return df_test,df_INFO\n-\n-\n-def get_finaldf(test):\n-    new_df = test.groupby([\'positions\'])\n-    groups = list(new_df.groups.keys())\n-    for i in range(len(groups)-1):\n-        for j in range(i, len(groups)):\n-            inii = int(groups[i][1:].split(\'..\')[0])\n-            inij = int(groups[j][1:].split(\'..\')[0])\n-            if inij < inii:\n-                temp = groups[i]\n-                groups[i] = groups[j]\n-                groups[j] = temp\n-    new_inds = []\n-    for g in groups:\n-        inds = new_df.groups[g]\n-        if len(inds) == 1: new_inds.append(inds[0])\n-        else:\n-            maxi = max(new_df.get_group(g)[\'scores\'])\n-            i = new_df.groups[g][new_df.get_group(g)[\'scores\']==maxi][0]\n-            new_inds.append(i)\n-    \n-    output = test.loc[new_inds,:]\n-    output.to_html(\'output.html\',index=False)\n-    recs = []\n-    for ind,row in output.iterrows():\n-        s = Seq(row[\'promoter_seq\'])\n-        posis = row[\'positions\']\n-        typ = row[\'promoter_type\']\n-        score = row[\'scores\']\n-        sq = SeqRecord(seq=s, id=ind, description=typ+\' \'+posis+\' score=\'+str(score))\n-        recs.append(sq)\n-    SeqIO.write(recs, \'output.fasta\',\'fasta\')\n-    \n-def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):\n-    from sklearn.externals import joblib\n-    scaler = joblib.load(scaler_file)\n-    model = joblib.load(model_file)\n-    feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])\n-    TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)\n-    scores = model.predict_proba(TEST_scaled)\n-    pos_scores = 
np.empty((TEST_scaled.shape[0],0), float)\n-    for x in scores: pos_scores = np.append(pos_scores,x[1])\n-    try: positive_indexes = np.nonzero(pos_scores>=float(threshold))[0] #escolher os positivos, podia ser escolher com score > x\n-    except ValueError: return \'The threshold value is not a float\'\n-    else:\n-        if len(positive_indexes) == 0: return None\n-        else:\n-            positive_windows = TEST_scaled.index[positive_indexes]\n-            INFO = df_testinfo.loc[positive_windows,[\'positions\',\'promoter_seq\']]\n-            promoter_type = []\n-            for x in df_testinfo.loc[positive_windows,\'host\'].tolist():\n-                if x == 0: promoter_type.append(\'phage\')\n-                else: promoter_type.append(\'host\')\n-            INFO[\'promoter_type\'] = promoter_type\n-            INFO[\'scores\'] = np.around(pos_scores[positive_indexes],decimals=3)\n-            INFO.index = positive_windows\n-            return INFO\n-\n-if __name__== "__main__":\n-    \n-    import sys\n-    import os\n-    __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))\n-    scaler_file = os.path.join(__location__, \'scaler.sav\')\n-    model_file = os.path.join(__location__, \'model.sav\')\n-    \n-    gen_format = sys.argv[1]\n-    genome_file = sys.argv[2]\n-    both = sys.argv[3]\n-    threshold = sys.argv[4]\n-    family = sys.argv[5]\n-    host = sys.argv[6]\n-    phage_type = sys.argv[7]\n-    \'\'\'\n-    gen_format = \'gb\'\n-    genome_file = \'test-data/NC_015264.gb\'\n-    genbank_fasta = \'genbank\'\n-    both = False\n-    threshold = \'0.50\'\n-    family = \'Podoviridae\'\n-    host = \'Pseudomonas\'\n-    phage_type = \'virulent\'\n-    \'\'\'\n-\n-    test_windows = get_testseqs65(gen_format, genome_file,both)\n-    try: score_test,dic_window = get_testScores(__location__,test_windows)\n-    \n-    except IndexError: print(\'Error. 
Input sequence can only have A,C,G or T\')\n-    \n-    else:\n-        df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)\n-        \n-        preds =  get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)\n-        if preds is None: print(\'There is no sequence with a score value higher or equal to the threshold \'+str(threshold))\n-        elif type(preds) == str: print(preds)\n-        else: output = get_finaldf(preds)\n-        \n'
b
diff -r f391041379cc -r 4691c4418b00 phage_promoters.xml
--- a/phage_promoters.xml Wed Aug 01 05:59:45 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,104 +0,0 @@
-<tool id="get_proms" name="PhagePromoters" version="0.1.0">
-    <description>
-Get promoters of phage genomes
-    </description>
-    <requirements>
-        <requirement type="package">biopython</requirement>
-        <requirement type="package">scikit-learn</requirement>
-        <requirement type="package">numpy</requirement>
-        <requirement type="package">pandas</requirement>
-    </requirements>
-    <command detect_errors="exit_code" interpreter="python3"><![CDATA[ 
- phage_promoters.py "$input_type.genome_format" "$genome" "$both" "$threshold" "$family" "$bacteria"  "$lifecycle"
- ]]>
-    </command>
-    <inputs>
- <conditional name="input_type">
-           <param type="select" name="genome_format" label='file format'>
-              <option value="genbank" selected="yes">genbank</option>
-              <option value="fasta">fasta</option>
-    </param>
-    <when value="genbank">
-              <param type="data" name="genome" format="genbank" label='genome'/>
-    </when>
-    <when value="fasta">
-              <param type="data" name="genome" format="fasta" label='genome'/>
-    </when>
-        </conditional>
-        <param type="boolean" name="both" label='Search both strands' checked="false" truevalue="-both" falsevalue="" />
- <param name="threshold" type="float" value="0.50" label="Threshold" help="Probability of being a promoter (float between 0 and 1)" />
-        <param type="select" name="family" label='Phage family'>
-   <option value="Podoviridae" selected="yes">Podoviridae</option>
-   <option value="Siphoviridae">Siphoviridae</option>
-   <option value="Myoviridae">Myoviridae</option>
- </param>
-        <param type="select" name="bacteria" label='Host bacteria Genus'>
-   <option value="Escherichia coli" selected="yes">Escherichia coli</option>
-   <option value="Salmonella">Salmonella</option>
-   <option value="Pseudomonas">Pseudomonas</option>
-   <option value="Yersinia">Yersinia</option>
-   <option value="Morganella">Morganella</option>
-   <option value="Cronobacter">Cronobacter</option>
-   <option value="Staphylococcus">Staphylococcus</option>
-   <option value="Streptococcus">Streptococcus</option>
-   <option value="Lactococcus">Lactococcus</option>
-   <option value="Streptomyces">Streptomyces</option>
-   <option value="Klebsiella">Klebsiella</option>
-   <option value="Bacillus">Bacillus</option>
-   <option value="Pectobacterium">Pectobacterium</option>
-   <option value="other">other</option>
- </param>
-        <param type="select" name="lifecycle" label='Phage type'>
-   <option value="virulent" selected="yes">virulent</option>
-   <option value="temperate">temperate</option>
- </param>
-    </inputs>
-    <outputs>
-        <data name="output1" format="html" from_work_dir="output.html" />
-        <data name="output2" format="fasta" from_work_dir="output.fasta" />
-    </outputs>
-    <tests>
-        <test>
-     <param name="genome_format" value="genbank"/>
-            <param name="genome" value="NC_015264.gb"/>
-            <param name="both" value="False"/>
-     <param name="threshold" value="0.50"/>
-            <param name="family" value="Podoviridae"/>
-            <param name="bacteria" value="Pseudomonas"/>
-            <param name="lifecycle" value="virulent"/>
-            <output name="output1" file="output.html"/>
-            <output name="output2" file="output.fasta"/>
-        </test>
-    </tests>
-    <help><![CDATA[
-
-===============
-PhagePromoters
-===============
-
-Get promoters of phage genomes
-
-PhagePromoters is a Python script that predicts promoter sequences in phage genomes using a machine-learning SVM model. The model was built from a training dataset with 25 features and 3200 examples (800 positives and 2400 negatives), each representing a 65 bp sequence of a phage genome. The positive cases are phage sequences that have already been identified as promoters.
-
-**Inputs:**
-
-* genome format: fasta vs genbank; 
-* genome file: accepts both genbank and fasta formats;
-* both strands (yes or no): allows the search in both DNA strands;
-* threshold: the minimum probability for a test sequence to be reported as a promoter (float between 0 and 1);
-* family: The family of the testing phage - Podoviridae, Siphoviridae or Myoviridae;
-* Bacteria: The host of the phage. The training dataset includes the following hosts: Bacillus, Escherichia coli, Salmonella, Pseudomonas, Yersinia, Klebsiella, Pectobacterium, Morganella, Cronobacter, Staphylococcus, Streptococcus, Streptomyces, Lactococcus. If the phage being tested has a different host, select the option 'other'; in that case a higher threshold value is recommended for more accurate results.
-* phage type: The type of the phage, according to its lifecycle: virulent or temperate;
-
-**Outputs:**
-This tool outputs two files: a FASTA file and a table in HTML, with the locations, sequence, score and type (recognized by host or phage RNAP) of the predicted promoters.
-
-**Requirements:**
-
-* Biopython
-* Sklearn 
-* Numpy
-* Pandas  
-
-    ]]></help>
-</tool>
b
diff -r f391041379cc -r 4691c4418b00 scaler.sav
b
Binary file scaler.sav has changed