view auxiliar.py @ 0:404a98e6759c draft

Uploaded
author martasampaio
date Sat, 20 Apr 2019 10:55:45 -0400
parents
children
line wrap: on
line source

# -*- coding: utf-8 -*-
"""
Created on Sun May 27 17:37:09 2018

@author: Marta
"""


#get the phage host from the file 'bacteria.xlsx'
def get_bacteria(file):
    """Build a lookup of host bacteria from an Excel sheet.

    The sheet is loaded with its first row as the header and its first
    column as the index. For every row, the value found in the 'Bacteria'
    column is stored under that row's index label (presumably the phage
    identifier -- confirm against the spreadsheet).

    Parameters
    ----------
    file
        Anything ``pandas.read_excel`` accepts as its first argument
        (e.g. the path to 'bacteria.xlsx').

    Returns
    -------
    dict
        ``{index label: value of the 'Bacteria' column}``. When an index
        label occurs more than once, the last row wins.

    Raises
    ------
    KeyError
        If the sheet has at least one row but no 'Bacteria' column.
    """
    import pandas as pd

    table = pd.read_excel(file, header=0, index_col=0)
    return {label: record['Bacteria'] for label, record in table.iterrows()}

#get the phage family from the file 'family.xlsx'
def get_families(file):
    """Build a lookup of families from an Excel sheet.

    The sheet is loaded with its first row as the header and its first
    column as the index. For every row, the value found in the 'Family'
    column is stored under that row's index label (presumably the phage
    identifier -- confirm against the spreadsheet).

    Parameters
    ----------
    file
        Anything ``pandas.read_excel`` accepts as its first argument
        (e.g. the path to 'family.xlsx').

    Returns
    -------
    dict
        ``{index label: value of the 'Family' column}``. When an index
        label occurs more than once, the last row wins.

    Raises
    ------
    KeyError
        If the sheet has at least one row but no 'Family' column.
    """
    import pandas as pd

    table = pd.read_excel(file, header=0, index_col=0)
    return {label: record['Family'] for label, record in table.iterrows()}

#get phage lifecycle from the file 'lifecycle.xlsx'
def get_lifecycle(file):
    """Build a lookup of lifecycle types from an Excel sheet.

    The sheet is loaded with its first row as the header and its first
    column as the index. For every row, the value found in the 'lifecycle'
    column (lower-case name) is stored under that row's index label
    (presumably the phage identifier -- confirm against the spreadsheet).

    Parameters
    ----------
    file
        Anything ``pandas.read_excel`` accepts as its first argument
        (e.g. the path to 'lifecycle.xlsx').

    Returns
    -------
    dict
        ``{index label: value of the 'lifecycle' column}``. When an index
        label occurs more than once, the last row wins.

    Raises
    ------
    KeyError
        If the sheet has at least one row but no 'lifecycle' column.
    """
    import pandas as pd

    table = pd.read_excel(file, header=0, index_col=0)
    return {label: record['lifecycle'] for label, record in table.iterrows()}

#reads a file with a PSSM and return the max possible score of that PSSM
def get_max_pssm(file_pssm):
    """Return the maximum score attainable with the PSSM stored in a file.

    The file is read as tab-separated text in which every non-blank line is
    one row of floats. The first four rows are used as the per-position
    scores for 'A', 'C', 'G' and 'T', in that order.

    Parameters
    ----------
    file_pssm
        Path of the tab-separated PSSM file.

    Returns
    -------
    The ``max`` attribute of the Biopython ``PositionSpecificScoringMatrix``
    built from the file.

    Raises
    ------
    ValueError
        If a field cannot be converted to ``float``.
    IndexError
        If the file contains fewer than four rows.
    """
    # NOTE(review): Bio.Alphabet is absent from recent Biopython releases
    # (removed in 1.78) -- confirm the Biopython version this project pins.
    from Bio.Alphabet import IUPAC
    from Bio.motifs import matrix

    # The context manager closes the handle even when parsing fails; the
    # previous version never closed it.
    with open(file_pssm, 'r') as handle:
        rows = [
            [float(field) for field in line.strip().split('\t')]
            for line in handle
            if line.strip()  # skip blank lines instead of failing on float('')
        ]

    by_base = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
    pssm = matrix.PositionSpecificScoringMatrix(IUPAC.unambiguous_dna, by_base)
    return pssm.max

#reads a file with a PSSM and returns a list of scores in all positions of the sequence
#returns the score divided by the maximum possible value
def get_scores(file_pssm, seq):
    """Score a sequence against the PSSM stored in a file.

    The file is read as tab-separated text in which every non-blank line is
    one row of floats. The first four rows are used as the per-position
    scores for 'A', 'C', 'G' and 'T', in that order. Each hit reported by the
    matrix search is divided by the matrix's maximum possible score.

    Parameters
    ----------
    file_pssm
        Path of the tab-separated PSSM file.
    seq
        Sequence object to scan. Its ``alphabet`` attribute is overwritten
        with ``IUPAC.unambiguous_dna`` (side effect visible to the caller).

    Returns
    -------
    tuple of (list, list)
        ``(scores, positions)``: the normalised score of every hit and the
        position reported for it, in the order the search yields them.

    Raises
    ------
    ValueError
        If a field cannot be converted to ``float``.
    IndexError
        If the file contains fewer than four rows.
    """
    # NOTE(review): Bio.Alphabet is absent from recent Biopython releases
    # (removed in 1.78) -- confirm the Biopython version this project pins.
    from Bio.Alphabet import IUPAC
    from Bio.motifs import matrix

    # The context manager closes the handle even when parsing fails; the
    # previous version never closed it.
    with open(file_pssm, 'r') as handle:
        rows = [
            [float(field) for field in line.strip().split('\t')]
            for line in handle
            if line.strip()  # skip blank lines instead of failing on float('')
        ]

    alphabet = IUPAC.unambiguous_dna
    by_base = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
    pssm = matrix.PositionSpecificScoringMatrix(alphabet, by_base)

    # Same value get_max_pssm(file_pssm) yields, taken from the matrix already
    # built here so the file is not read and parsed a second time.
    max_score = pssm.max

    # Tag the sequence with the matrix's alphabet before searching
    # (presumably required by the search's alphabet check -- confirm).
    seq.alphabet = alphabet

    scores = []
    positions = []
    # both=False: the reverse-complement strand is not searched.
    # threshold=-50: presumably low enough to keep every position -- confirm.
    for pos, score in pssm.search(seq, both=False, threshold=-50):
        scores.append(score / max_score)
        positions.append(pos)
    return scores, positions

#returns the number of A and T bases in a sequence (a raw count, not a fraction)
def freq_base(seq):
    """Count the 'A' and 'T' occurrences in *seq*.

    The result is a raw count (not divided by the sequence length) and the
    match is case-sensitive: only upper-case 'A' and 'T' are counted.

    Parameters
    ----------
    seq
        Any object exposing ``count`` (e.g. a ``str``).

    Returns
    -------
    int
        ``seq.count('A') + seq.count('T')``.
    """
    return seq.count('A') + seq.count('T')

#returns the free energy value of that sequence
def free_energy(seq):
    """Add up a per-dinucleotide energy value over all adjacent base pairs.

    Every overlapping pair ``seq[k] + seq[k + 1]`` is looked up in a fixed
    table of the 16 upper-case DNA dinucleotides and the values are summed
    from left to right.

    Parameters
    ----------
    seq
        Indexable sequence of single-character bases (e.g. a ``str``).

    Returns
    -------
    The accumulated value; the integer ``0`` when *seq* has fewer than two
    bases.

    Raises
    ------
    KeyError
        If any adjacent pair is not one of the 16 upper-case A/C/G/T
        dinucleotides (e.g. lower-case input or an 'N').
    """
    # NOTE(review): the values look like nearest-neighbour stacking free
    # energies; their source and units are not stated in this file -- confirm.
    pair_energy = {
        'AA': -1.00, 'TT': -1.00,
        'AT': -0.88, 'TA': -0.58,
        'CA': -1.45, 'AC': -1.44,
        'GG': -1.84, 'CC': -1.84,
        'GA': -1.30, 'AG': -1.28,
        'TC': -1.30, 'CT': -1.28,
        'TG': -1.45, 'GT': -1.44,
        'GC': -2.24, 'CG': -2.17,
    }

    energy = 0
    # Plain left-to-right accumulation keeps the floating-point result
    # independent of how sum() is implemented.
    for left in range(len(seq) - 1):
        energy += pair_energy[seq[left] + seq[left + 1]]
    return energy