0
|
1 # -*- coding: utf-8 -*-
|
|
2 """
|
|
3 Created on Sun May 27 17:37:09 2018
|
|
4
|
|
5 @author: Marta
|
|
6 """
|
|
7
|
|
8
|
|
9 #get the phage host from the file 'bacteria.xlsx'
|
|
10 def get_bacteria(file):
|
|
11 import pandas as pd
|
|
12 df = pd.read_excel(file,header=0,index_col=0)
|
|
13 bacteria = {}
|
|
14 for ind,row in df.iterrows():
|
|
15 bac = row['Bacteria']
|
|
16 bacteria[ind] = bac
|
|
17 return bacteria
|
|
18
|
|
19 #get the phage family from the file 'family.xlsx'
|
|
20 def get_families(file):
|
|
21 import pandas as pd
|
|
22 df = pd.read_excel(file,header=0,index_col=0)
|
|
23 families = {}
|
|
24 for ind,row in df.iterrows():
|
|
25 fam = row['Family']
|
|
26 families[ind] = fam
|
|
27 return families
|
|
28
|
|
29 #get phage lifecycle from the file 'lifecycle.xlsx'
|
|
30 def get_lifecycle(file):
|
|
31 import pandas as pd
|
|
32 df = pd.read_excel(file,header=0,index_col=0)
|
|
33 types = {}
|
|
34 for ind,row in df.iterrows():
|
|
35 lc = row['lifecycle']
|
|
36 types[ind] = lc
|
|
37 return types
|
|
38
|
|
39 #reads a file with a PSSM and return the max possible score of that PSSM
|
|
40 def get_max_pssm(file_pssm):
|
|
41 from Bio.Alphabet import IUPAC
|
|
42 from Bio.motifs import matrix
|
|
43 m = []
|
|
44 fic = open(file_pssm,'r')
|
|
45 rf = fic.readline()
|
|
46 while rf:
|
|
47 new_l = []
|
|
48 l = rf.strip().split('\t')
|
|
49 for val in l:
|
|
50 x = float(val)
|
|
51 new_l.append(x)
|
|
52 m.append(new_l)
|
|
53 rf = fic.readline()
|
|
54 a = IUPAC.unambiguous_dna
|
|
55 dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
|
|
56 pssm = matrix.PositionSpecificScoringMatrix(a,dic)
|
|
57 return pssm.max
|
|
58
|
|
59 #reads a file with a PSSM and returns a list of scores in all positions of the sequence
|
|
60 #returns the score divided by the maximum possible value
|
|
61 def get_scores(file_pssm, seq):
|
|
62 from Bio.Alphabet import IUPAC
|
|
63 from Bio.motifs import matrix
|
|
64 maxi = get_max_pssm(file_pssm)
|
|
65 m = []
|
|
66 fic = open(file_pssm,'r')
|
|
67 rf = fic.readline()
|
|
68 while rf:
|
|
69 new_l = []
|
|
70 l = rf.strip().split('\t')
|
|
71 for val in l:
|
|
72 x = float(val)
|
|
73 new_l.append(x)
|
|
74 m.append(new_l)
|
|
75 rf = fic.readline()
|
|
76 a = IUPAC.unambiguous_dna
|
|
77 dic = {'A':m[0],'C':m[1], 'G':m[2], 'T':m[3]}
|
|
78 pssm = matrix.PositionSpecificScoringMatrix(a,dic)
|
|
79 scores = []
|
|
80 positions = []
|
|
81 a = IUPAC.unambiguous_dna
|
|
82 seq.alphabet = a
|
|
83 for pos, score in pssm.search(seq, both=False,threshold=-50):
|
|
84 scores.append(score/maxi)
|
|
85 positions.append(pos)
|
|
86 return scores,positions
|
|
87
|
|
88 #returns the frequencia of A and T bases in a sequence
|
|
89 def freq_base(seq):
|
|
90 A = seq.count('A')
|
|
91 T = seq.count('T')
|
|
92 AT = A+T
|
|
93 return AT
|
|
94
|
|
95 #returns the free energy value of that sequence
|
|
96 def free_energy(seq):
|
|
97 dic1 = {'AA':-1.00,
|
|
98 'TT':-1.00,
|
|
99 'AT':-0.88,
|
|
100 'TA':-0.58,
|
|
101 'CA':-1.45,
|
|
102 'AC':-1.44,
|
|
103 'GG':-1.84,
|
|
104 'CC':-1.84,
|
|
105 'GA':-1.30,
|
|
106 'AG':-1.28,
|
|
107 'TC':-1.30,
|
|
108 'CT':-1.28,
|
|
109 'TG':-1.45,
|
|
110 'GT':-1.44,
|
|
111 'GC':-2.24,
|
|
112 'CG':-2.17}
|
|
113 total = 0
|
|
114 i = 0
|
|
115 j = 1
|
|
116 while i < len(seq)-1:
|
|
117 dint = seq[i]+seq[j]
|
|
118 total += dic1[dint]
|
|
119 i += 1
|
|
120 j += 1
|
|
121 return total |