annotate auxiliar.py @ 13:8b9534a83ae2 draft

Uploaded
author martasampaio
date Sat, 20 Apr 2019 11:03:04 -0400
parents 404a98e6759c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
1 # -*- coding: utf-8 -*-
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
2 """
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
3 Created on Sun May 27 17:37:09 2018
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
4
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
5 @author: Marta
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
6 """
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
7
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
8
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
9 #get the phage host from the file 'bacteria.xlsx'
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def get_bacteria(file):
    """Map every index label of the spreadsheet to its 'Bacteria' value.

    The sheet is read with the first row as header and the first column as
    index (presumably the phage identifier -- confirm against the data
    file).  When an index label occurs more than once, the last row wins.

    Args:
        file: path (or anything ``pandas.read_excel`` accepts) of the
            spreadsheet, e.g. 'bacteria.xlsx'.

    Returns:
        dict of ``{index label: value in the 'Bacteria' column}``; empty
        when the sheet has no data rows.

    Raises:
        KeyError: if the sheet has data rows but no 'Bacteria' column.
    """
    import pandas as pd  # function-local import, as in the original file

    df = pd.read_excel(file, header=0, index_col=0)
    if len(df.index) == 0:
        # The original row loop never looked up the column on an empty
        # sheet, so an empty result is returned without touching it.
        return {}
    # One column lookup instead of building a Series per row with iterrows().
    return df['Bacteria'].to_dict()
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
18
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
19 #get the phage family from the file 'family.xlsx'
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def get_families(file):
    """Map every index label of the spreadsheet to its 'Family' value.

    The sheet is read with the first row as header and the first column as
    index (presumably the phage identifier -- confirm against the data
    file).  When an index label occurs more than once, the last row wins.

    Args:
        file: path (or anything ``pandas.read_excel`` accepts) of the
            spreadsheet, e.g. 'family.xlsx'.

    Returns:
        dict of ``{index label: value in the 'Family' column}``; empty
        when the sheet has no data rows.

    Raises:
        KeyError: if the sheet has data rows but no 'Family' column.
    """
    import pandas as pd  # function-local import, as in the original file

    df = pd.read_excel(file, header=0, index_col=0)
    if len(df.index) == 0:
        # The original row loop never looked up the column on an empty
        # sheet, so an empty result is returned without touching it.
        return {}
    # One column lookup instead of building a Series per row with iterrows().
    return df['Family'].to_dict()
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
28
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
29 #get phage lifecycle from the file 'lifecycle.xlsx'
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def get_lifecycle(file):
    """Map every index label of the spreadsheet to its 'lifecycle' value.

    The sheet is read with the first row as header and the first column as
    index (presumably the phage identifier -- confirm against the data
    file).  When an index label occurs more than once, the last row wins.

    Args:
        file: path (or anything ``pandas.read_excel`` accepts) of the
            spreadsheet, e.g. 'lifecycle.xlsx'.

    Returns:
        dict of ``{index label: value in the 'lifecycle' column}``; empty
        when the sheet has no data rows.

    Raises:
        KeyError: if the sheet has data rows but no 'lifecycle' column
            (the column name is lower-case, unlike 'Bacteria'/'Family').
    """
    import pandas as pd  # function-local import, as in the original file

    df = pd.read_excel(file, header=0, index_col=0)
    if len(df.index) == 0:
        # The original row loop never looked up the column on an empty
        # sheet, so an empty result is returned without touching it.
        return {}
    # One column lookup instead of building a Series per row with iterrows().
    return df['lifecycle'].to_dict()
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
38
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
39 #reads a file with a PSSM and returns the maximum possible score of that PSSM
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def get_max_pssm(file_pssm):
    """Return the highest score attainable with the PSSM stored in a file.

    The file is parsed as tab-separated floats, one matrix row per line.
    The first four rows are used as the scores for A, C, G and T, in that
    order; any further rows are ignored.

    Args:
        file_pssm: path of the PSSM text file.

    Returns:
        The ``max`` property of the Biopython
        ``PositionSpecificScoringMatrix`` built from those rows.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if the file holds fewer than four non-blank rows.
    """
    from Bio.motifs import matrix

    try:
        # Older Biopython releases expect an Alphabet object here.
        from Bio.Alphabet import IUPAC
        alphabet = IUPAC.unambiguous_dna
    except ImportError:
        # Bio.Alphabet was removed in Biopython 1.78; the matrix classes
        # take the alphabet as a plain string of letters since then.
        alphabet = 'ACGT'

    # The context manager closes the handle (the original left it open).
    # Whitespace-only lines are skipped so that a trailing blank line does
    # not end up in float('').
    with open(file_pssm, 'r') as handle:
        rows = [
            [float(field) for field in line.strip().split('\t')]
            for line in handle
            if line.strip()
        ]

    values = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
    pssm = matrix.PositionSpecificScoringMatrix(alphabet, values)
    return pssm.max
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
58
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
59 #reads a file with a PSSM and returns a list of scores in all positions of the sequence
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
60 #returns the score divided by the maximum possible value
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def get_scores(file_pssm, seq):
    """Score every position of ``seq`` against the PSSM stored in a file.

    The file is parsed as tab-separated floats, one matrix row per line;
    the first four rows are the scores for A, C, G and T, in that order.
    The forward strand only is searched (``both=False``) with a threshold
    of -50, and each raw score is divided by the matrix's maximum possible
    score.

    Args:
        file_pssm: path of the PSSM text file.
        seq: Biopython sequence object to scan.  On Biopython releases that
            still ship ``Bio.Alphabet`` its ``alphabet`` attribute is set to
            unambiguous DNA, exactly as the original code did.

    Returns:
        Tuple ``(scores, positions)`` of two parallel lists: the normalised
        scores and the positions reported by ``pssm.search``.

    Raises:
        ValueError: if a field cannot be converted to float.
        IndexError: if the file holds fewer than four non-blank rows.
    """
    from Bio.motifs import matrix

    try:
        # Older Biopython releases expect Alphabet objects on both the
        # matrix and the sequence being searched.
        from Bio.Alphabet import IUPAC
    except ImportError:
        # Bio.Alphabet was removed in Biopython 1.78; the matrix takes a
        # plain string and sequences no longer carry an alphabet.
        alphabet = 'ACGT'
    else:
        alphabet = IUPAC.unambiguous_dna
        seq.alphabet = alphabet

    # The context manager closes the handle (the original left it open).
    # Whitespace-only lines are skipped so that a trailing blank line does
    # not end up in float('').
    with open(file_pssm, 'r') as handle:
        rows = [
            [float(field) for field in line.strip().split('\t')]
            for line in handle
            if line.strip()
        ]

    values = {'A': rows[0], 'C': rows[1], 'G': rows[2], 'T': rows[3]}
    pssm = matrix.PositionSpecificScoringMatrix(alphabet, values)
    # Same value get_max_pssm(file_pssm) yields, without reading the file
    # a second time.
    maxi = pssm.max

    scores = []
    positions = []
    for pos, score in pssm.search(seq, both=False, threshold=-50):
        scores.append(score / maxi)
        positions.append(pos)
    return scores, positions
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
87
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
88 #returns the number of A and T bases in a sequence (a count, not a ratio)
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
def freq_base(seq):
    """Return how many 'A' and 'T' characters ``seq`` contains.

    Despite the function name this is a raw count, not a frequency: the
    total is not divided by the sequence length.  Matching is
    case-sensitive, so lower-case 'a'/'t' are not counted.

    Args:
        seq: any object with a ``count`` method (e.g. a ``str``).

    Returns:
        ``seq.count('A') + seq.count('T')``.
    """
    return seq.count('A') + seq.count('T')
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
94
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
95 #returns the free energy value of that sequence
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
96 def free_energy(seq):
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
97 dic1 = {'AA':-1.00,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
98 'TT':-1.00,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
99 'AT':-0.88,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
100 'TA':-0.58,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
101 'CA':-1.45,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
102 'AC':-1.44,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
103 'GG':-1.84,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
104 'CC':-1.84,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
105 'GA':-1.30,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
106 'AG':-1.28,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
107 'TC':-1.30,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
108 'CT':-1.28,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
109 'TG':-1.45,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
110 'GT':-1.44,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
111 'GC':-2.24,
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
112 'CG':-2.17}
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
113 total = 0
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
114 i = 0
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
115 j = 1
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
116 while i < len(seq)-1:
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
117 dint = seq[i]+seq[j]
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
118 total += dic1[dint]
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
119 i += 1
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
120 j += 1
404a98e6759c Uploaded
martasampaio
parents:
diff changeset
121 return total