Mercurial > repos > martasampaio > phagepromoter
view auxiliar.py @ 0:404a98e6759c draft
Uploaded
author | martasampaio |
---|---|
date | Sat, 20 Apr 2019 10:55:45 -0400 |
parents | |
children |
line wrap: on
line source
# -*- coding: utf-8 -*-
"""
Created on Sun May 27 17:37:09 2018

@author: Marta

Helper functions: Excel look-up tables (host, family, lifecycle), PSSM
scoring of a sequence, and simple sequence statistics.
"""

# Free-energy contribution of each dinucleotide, used by free_energy().
# NOTE(review): the values look like nearest-neighbour delta-G parameters
# (kcal/mol) -- confirm the source and units with the author.
_DINUCLEOTIDE_ENERGY = {
    'AA': -1.00, 'TT': -1.00, 'AT': -0.88, 'TA': -0.58,
    'CA': -1.45, 'AC': -1.44, 'GG': -1.84, 'CC': -1.84,
    'GA': -1.30, 'AG': -1.28, 'TC': -1.30, 'CT': -1.28,
    'TG': -1.45, 'GT': -1.44, 'GC': -2.24, 'CG': -2.17,
}


def _read_excel_column(file, column):
    """Return ``{index: value}`` for one column of an Excel sheet.

    The sheet is read with its first row as header and its first column as
    index. Rows are visited in order, so a repeated index keeps the value of
    its last row.
    """
    # Imported lazily so that importing this module does not require pandas.
    import pandas as pd

    df = pd.read_excel(file, header=0, index_col=0)
    return {ind: row[column] for ind, row in df.iterrows()}


def get_bacteria(file):
    """Get the phage host from an Excel file such as 'bacteria.xlsx'.

    Returns a dict mapping each index value of the sheet (presumably the
    phage identifier -- verify against the data file) to the content of its
    'Bacteria' column.
    """
    return _read_excel_column(file, 'Bacteria')


def get_families(file):
    """Get the phage family from an Excel file such as 'family.xlsx'.

    Returns a dict mapping each index value of the sheet to the content of
    its 'Family' column.
    """
    return _read_excel_column(file, 'Family')


def get_lifecycle(file):
    """Get the phage lifecycle from an Excel file such as 'lifecycle.xlsx'.

    Returns a dict mapping each index value of the sheet to the content of
    its 'lifecycle' column.
    """
    return _read_excel_column(file, 'lifecycle')


def _read_pssm_rows(file_pssm):
    """Parse a tab-separated PSSM file into a list of rows of floats.

    Blank lines are skipped; a non-numeric field raises ValueError.
    """
    rows = []
    # The context manager closes the handle even if a value fails to parse.
    with open(file_pssm, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # A trailing empty line would otherwise fail in float('').
                continue
            rows.append([float(val) for val in line.split('\t')])
    return rows


def _legacy_dna_alphabet():
    """Return ``IUPAC.unambiguous_dna``, or None if Bio.Alphabet is missing.

    Bio.Alphabet was removed in Biopython 1.78; newer releases describe an
    alphabet with a plain string of letters instead.
    """
    try:
        from Bio.Alphabet import IUPAC
    except ImportError:
        return None
    return IUPAC.unambiguous_dna


def _build_pssm(file_pssm):
    """Build the PSSM stored in *file_pssm*.

    The first four rows of the file are taken as the scores for A, C, G and
    T, in that order (IndexError if there are fewer than four rows).

    Returns ``(pssm, legacy_alphabet)``, where *legacy_alphabet* is the
    Bio.Alphabet object used, or None when running on a Biopython release
    without Bio.Alphabet.
    """
    from Bio.motifs import matrix

    m = _read_pssm_rows(file_pssm)
    dic = {'A': m[0], 'C': m[1], 'G': m[2], 'T': m[3]}
    legacy_alphabet = _legacy_dna_alphabet()
    alphabet = legacy_alphabet if legacy_alphabet is not None else 'ACGT'
    return matrix.PositionSpecificScoringMatrix(alphabet, dic), legacy_alphabet


def get_max_pssm(file_pssm):
    """Read a file with a PSSM and return the max possible score of it."""
    pssm, _ = _build_pssm(file_pssm)
    return pssm.max


def get_scores(file_pssm, seq):
    """Score every position of *seq* against the PSSM in *file_pssm*.

    Only the given strand is searched (``both=False``) and hits scoring
    below -50 are dropped by the search threshold.

    Returns ``(scores, positions)``: two parallel lists holding, for each
    hit, the score divided by the maximum possible score of the PSSM and the
    position reported by the search.

    Side effect: with a Biopython release that still ships Bio.Alphabet,
    ``seq.alphabet`` is set to the unambiguous DNA alphabet.
    """
    # Build the matrix once and reuse it for both the maximum and the search.
    pssm, legacy_alphabet = _build_pssm(file_pssm)
    maxi = pssm.max
    if legacy_alphabet is not None:
        seq.alphabet = legacy_alphabet

    scores = []
    positions = []
    for pos, score in pssm.search(seq, both=False, threshold=-50):
        scores.append(score / maxi)
        positions.append(pos)
    return scores, positions


def freq_base(seq):
    """Return the number of A and T bases in *seq*.

    This is a raw count, not a fraction, and it is case-sensitive: only
    upper-case 'A' and 'T' are counted.
    """
    return seq.count('A') + seq.count('T')


def free_energy(seq):
    """Return the free energy value of *seq*.

    The value is the sum of the table entries for every pair of adjacent
    bases. A sequence shorter than two bases yields 0.

    Raises KeyError if a pair is not in the table (for example lower-case
    or ambiguous bases).
    """
    total = 0
    # Explicit accumulation (rather than sum()) keeps the floating-point
    # additions in left-to-right order.
    for first, second in zip(seq, seq[1:]):
        total += _DINUCLEOTIDE_ENERGY[first + second]
    return total