# -*- coding: utf-8 -*-
# HG changeset patch
# User martasampaio
# Date 1555772145 14400
# Node ID 404a98e6759c7a9d4b9876e7eab7d97c2336ced3
# Uploaded
"""
Helper functions for phage metadata lookup, PSSM scoring and simple
sequence statistics.

Created on Sun May 27 17:37:09 2018

@author: Marta
"""


# Free-energy contribution of every DNA dinucleotide, keyed by the
# two-letter (upper-case) dinucleotide string.
_DINUCLEOTIDE_FREE_ENERGY = {
    'AA': -1.00,
    'TT': -1.00,
    'AT': -0.88,
    'TA': -0.58,
    'CA': -1.45,
    'AC': -1.44,
    'GG': -1.84,
    'CC': -1.84,
    'GA': -1.30,
    'AG': -1.28,
    'TC': -1.30,
    'CT': -1.28,
    'TG': -1.45,
    'GT': -1.44,
    'GC': -2.24,
    'CG': -2.17,
}


def _read_excel_column(file, column):
    """Map each index label of the Excel sheet *file* to its *column* value.

    The first row is used as the header and the first column as the index.
    When an index label is repeated, the last row wins.
    """
    # Imported lazily so the sequence helpers work without pandas installed.
    import pandas as pd
    df = pd.read_excel(file, header=0, index_col=0)
    return dict(zip(df.index, df[column]))


def get_bacteria(file):
    """Return a dict {index label: host} read from the 'Bacteria' column.

    file: path (or file-like object) of the Excel sheet, e.g. 'bacteria.xlsx'.
    """
    return _read_excel_column(file, 'Bacteria')


def get_families(file):
    """Return a dict {index label: family} read from the 'Family' column.

    file: path (or file-like object) of the Excel sheet, e.g. 'family.xlsx'.
    """
    return _read_excel_column(file, 'Family')


def get_lifecycle(file):
    """Return a dict {index label: lifecycle} read from the 'lifecycle' column.

    file: path (or file-like object) of the Excel sheet, e.g. 'lifecycle.xlsx'.
    """
    return _read_excel_column(file, 'lifecycle')


def _read_pssm_rows(file_pssm):
    """Parse a tab-separated matrix file into a list of rows of floats.

    Blank lines are skipped. Raises ValueError if a cell is not a number.
    """
    rows = []
    # 'with' guarantees the handle is closed even if a value fails to parse.
    with open(file_pssm, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # A trailing empty line would otherwise crash on float('').
                continue
            rows.append([float(value) for value in line.split('\t')])
    return rows


def _dna_alphabet():
    """Return the DNA alphabet in the form the installed Biopython expects.

    Releases that still ship Bio.Alphabet get the IUPAC alphabet object the
    original code used; releases without it (Bio.Alphabet was removed in
    Biopython 1.78) get the plain letter string 'ACGT'.
    """
    try:
        from Bio.Alphabet import IUPAC
    except ImportError:
        return 'ACGT'
    return IUPAC.unambiguous_dna


def _load_pssm(file_pssm, alphabet):
    """Build a PositionSpecificScoringMatrix from the file *file_pssm*.

    The first four rows of the file are taken as the A, C, G and T rows, in
    that order. Raises IndexError if the file has fewer than four rows.
    """
    from Bio.motifs import matrix
    rows = _read_pssm_rows(file_pssm)
    values = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
    return matrix.PositionSpecificScoringMatrix(alphabet, values)


def get_max_pssm(file_pssm):
    """Read a PSSM file and return the maximum score that PSSM can produce.

    file_pssm: path of a tab-separated file whose first four rows hold the
    A, C, G and T scores.
    """
    return _load_pssm(file_pssm, _dna_alphabet()).max


def get_scores(file_pssm, seq):
    """Score every position of *seq* against the PSSM stored in *file_pssm*.

    file_pssm: path of a tab-separated PSSM file (rows A, C, G, T).
    seq: Biopython sequence object to scan (forward strand only).

    Returns a tuple (scores, positions): each score is the raw PSSM score
    divided by the maximum possible score of the PSSM, and positions holds
    the matching start positions. Only hits scoring at least -50 are
    reported.
    """
    alphabet = _dna_alphabet()
    # The file is parsed once; the maximum comes from the same matrix.
    pssm = _load_pssm(file_pssm, alphabet)
    maxi = pssm.max
    if not isinstance(alphabet, str):
        # Legacy Biopython only: tag the sequence with the DNA alphabet, as
        # the original code did (presumably required by the scoring step of
        # those releases - TODO confirm). This mutates the caller's object.
        seq.alphabet = alphabet
    scores = []
    positions = []
    for pos, score in pssm.search(seq, both=False, threshold=-50):
        scores.append(score / maxi)
        positions.append(pos)
    return scores, positions


def freq_base(seq):
    """Return the number of 'A' plus 'T' bases in *seq*.

    This is an absolute count, not a fraction, and it is case-sensitive.
    """
    return seq.count('A') + seq.count('T')


def free_energy(seq):
    """Return the summed dinucleotide free energy of *seq*.

    Every overlapping pair of adjacent bases contributes its tabulated
    value. Sequences shorter than two bases yield 0. Raises KeyError when a
    pair is not an upper-case combination of A, C, G and T.
    """
    total = 0
    # Pair each base with its successor: (s0, s1), (s1, s2), ...
    for first, second in zip(seq, seq[1:]):
        total += _DINUCLEOTIDE_FREE_ENERGY[first + second]
    return total