annotate phagepromoter.py @ 6:30b5e33eca40 draft

Uploaded
author martasampaio
date Sat, 20 Apr 2019 10:58:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
1 # -*- coding: utf-8 -*-
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
2 """
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
3 Created on Thu Jul 19 13:45:05 2018
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
4
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
5 @author: Marta
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
6 """
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
7
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
8 from Bio import SeqIO
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
9 import numpy as np
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
10 import pandas as pd
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
11 from auxiliar import free_energy,freq_base
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
12 from Bio.Seq import Seq
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
13 from Bio.SeqRecord import SeqRecord
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
14 from Bio.Alphabet import IUPAC
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
15 from auxiliar import get_bacteria, get_families, get_max_pssm, get_scores, get_lifecycle
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
16
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
17 #division of the test genome in sequences of 65 bp
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
18 def get_testseqs65(form,fic,both=False):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
19 ALL = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
20 indexes = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
21 a = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
22 rec = SeqIO.read(fic,form)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
23 genome = rec.seq
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
24 i = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
25 j = 65
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
26 while j < len(genome):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
27 s = genome[i:j]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
28 ALL.append([1,i,j,s])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
29 i += 20
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
30 j += 20
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
31 a += 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
32 indexes.append(rec.name+":"+str(a))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
33 if both:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
34 comp = genome.reverse_complement()
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
35 size = len(rec.seq)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
36 i = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
37 j = 65
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
38 while j < len(comp):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
39 s = comp[i:j]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
40 ALL.append([-1,size-j,size-i,s])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
41 i += 20
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
42 j += 20
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
43 a += 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
44 indexes.append(rec.name+":"+str(a))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
45 df = pd.DataFrame(ALL, index=indexes, columns=['strand','iniprom','endprom','seq'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
46 return df
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
47
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
48 #calculate the scores of all sequences (similar to get_posScores and get_negScores)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
49 def get_testScores(loc,test):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
50 scores = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
51 posis = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
52 sizes = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
53 dic = {}
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
54 for ind,row in test.iterrows():
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
55 _,window = ind.split(':')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
56 strand = row['strand']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
57 ini = row['iniprom']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
58 end = row['endprom']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
59 seq = row['seq']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
60 pos = [ini,end,strand]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
61 dic[window] = pos
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
62 s = seq
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
63 score10_6,pos10_6 = get_scores(os.path.join(loc,'pssm10_6.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
64 maxi10_6 = get_max_pssm(os.path.join(loc,'pssm10_6.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
65 score10_8,pos10_8 = get_scores(os.path.join(loc,'pssm10_8.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
66 maxi10_8 = get_max_pssm(os.path.join(loc,'pssm10_8.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
67 scores23,pos23 = get_scores(os.path.join(loc,'pssm_23.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
68 maxi23 = get_max_pssm(os.path.join(loc,'pssm_23.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
69 scores21,pos21 = get_scores(os.path.join(loc,'pssm_21.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
70 maxi21 = get_max_pssm(os.path.join(loc,'pssm_21.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
71 scores27,pos27 = get_scores(os.path.join(loc,'pssm_27.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
72 maxi27 = get_max_pssm(os.path.join(loc,'pssm_27.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
73 scores32,pos32 = get_scores(os.path.join(loc,'pssm_32.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
74 maxi32 = get_max_pssm(os.path.join(loc,'pssm_32.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
75 score23 = max(scores23)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
76 score21 = max(scores21)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
77 score27 = max(scores27)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
78 score32 = max(scores32)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
79 maxiphage = max(score23,score21,score27,score32)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
80 if maxiphage == score23: phage_max = score23*maxi23
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
81 elif maxiphage == score21: phage_max = score21*maxi21
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
82 elif maxiphage == score27: phage_max = score27*maxi27
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
83 elif maxiphage == score32: phage_max = score32*maxi32
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
84 score35_6,pos35_6 = get_scores(os.path.join(loc,'pssm35_6.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
85 maxi35_6 = get_max_pssm(os.path.join(loc,'pssm35_6.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
86 score35_9,pos35_9 = get_scores(os.path.join(loc,'pssm35_9.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
87 maxi35_9 = get_max_pssm(os.path.join(loc,'pssm35_9.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
88 score35_t4,pos35_t4 = get_scores(os.path.join(loc,'pssm35_t4.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
89 maxi35_t4 = get_max_pssm(os.path.join(loc,'pssm35_t4.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
90 score35_cbb,pos35_cbb = get_scores(os.path.join(loc,'pssm35_cbb.txt'), s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
91 maxi35_cbb = get_max_pssm(os.path.join(loc,'pssm35_cbb.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
92 score35_lb,pos35_lb = get_scores(os.path.join(loc,'pssm35_lb.txt'),s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
93 maxi35_lb = get_max_pssm(os.path.join(loc,'pssm35_lb.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
94 score35_mu, pos35_mu = get_scores(os.path.join(loc,'pssm35_mu.txt'),s)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
95 maxi35_mu = get_max_pssm(os.path.join(loc,'pssm35_mu.txt'))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
96
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
97 dists6 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
98 score6 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
99 for p in pos10_6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
100 for a in range(14,22):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
101 d = p-a-6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
102 if d >= 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
103 s10 = score10_6[p]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
104 s35_6 = score35_6[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
105 score6.append([s35_6,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
106 dists6.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
107 dists9 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
108 score9 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
109 for p in pos10_6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
110 for a in range(11,14):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
111 d = p-a-9
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
112 if d >= 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
113 s10 = score10_6[p]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
114 s35_9 = score35_9[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
115 score9.append([s35_9,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
116 dists9.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
117 distst4 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
118 scoret4 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
119 distscbb = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
120 scorecbb = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
121 for p in pos10_6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
122 for a in range(16,18):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
123 d = p-a-7
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
124 if d >= 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
125 s10 = score10_6[p]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
126 s35_t4 = score35_t4[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
127 s35_cbb = score35_cbb[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
128 scoret4.append([s35_t4,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
129 distst4.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
130 scorecbb.append([s35_cbb,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
131 distscbb.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
132
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
133
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
134 distsmu = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
135 scoremu = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
136 for p in pos10_6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
137 d = p-16-14
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
138 if d >= 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
139 s10 = score10_6[p]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
140 s35_mu = score35_mu[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
141 scoremu.append([s35_mu,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
142 distsmu.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
143
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
144 distslb = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
145 scorelb = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
146 for p in pos10_6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
147 d = p-13-14
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
148 if d >= 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
149 s10 = score10_6[p]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
150 s35_lb = score35_lb[d]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
151 scorelb.append([s35_lb,s10])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
152 distslb.append([d,p])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
153
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
154 soma6 = [sum(x) for x in score6]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
155 soma9 = [sum(x) for x in score9]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
156 somat4 = [sum(x) for x in scoret4]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
157 somacbb = [sum(x) for x in scorecbb]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
158 somamu = [sum(x) for x in scoremu]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
159 somalb = [sum(x) for x in scorelb]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
160
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
161 maxi6 = max(soma6)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
162 maxi9 = max(soma9)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
163 maxit4 = max(somat4)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
164 maxicbb = max(somacbb)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
165 maximu = max(somamu)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
166 maxilb = max(somalb)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
167 maxi_elems = max(maxi6,maxi9,maxit4,maxicbb,maxilb,maximu)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
168
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
169 if maxi_elems == maxilb:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
170 indmax = somalb.index(maxilb)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
171 sc35 = scorelb[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
172 sc10 = scorelb[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
173 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
174 posel = distslb[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
175 size35 = 14
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
176 elems_maxi = sc35*maxi35_lb+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
177 elif maxi_elems == maximu:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
178 indmax = somamu.index(maximu)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
179 sc35 = scoremu[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
180 sc10 = scoremu[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
181 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
182 posel = distsmu[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
183 size35 = 14
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
184 elems_maxi = sc35*maxi35_mu+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
185 elif maxi_elems == maxi9:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
186 indmax = soma9.index(maxi9)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
187 sc35 = score9[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
188 sc10 = score9[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
189 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
190 posel = dists9[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
191 size35 = 9
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
192 elems_maxi = sc35*maxi35_9+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
193 elif maxi_elems == maxit4:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
194 indmax = somat4.index(maxit4)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
195 sc35 = scoret4[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
196 sc10 = scoret4[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
197 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
198 posel = distst4[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
199 size35 = 7
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
200 elems_maxi = sc35*maxi35_t4+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
201 elif maxi_elems == maxicbb:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
202 indmax = somacbb.index(maxicbb)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
203 sc35 = scorecbb[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
204 sc10 = scorecbb[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
205 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
206 posel = distscbb[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
207 size35 = 7
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
208 elems_maxi = sc35*maxi35_cbb+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
209 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
210 indmax = soma6.index(maxi6)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
211 sc35 = score6[indmax][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
212 sc10 = score6[indmax][1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
213 score_elems = [sc35,sc10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
214 posel = dists6[indmax]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
215 size35 = 6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
216 elems_maxi = sc35*maxi35_6+sc10*maxi10_6
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
217
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
218 if score23 == maxiphage:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
219 phage_score = score23
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
220 posiphage = scores23.index(score23)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
221 sizephage = 23
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
222 elif score21 == maxiphage:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
223 phage_score = score21
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
224 posiphage = scores21.index(score21)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
225 sizephage = 21
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
226 elif score27 == maxiphage:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
227 phage_score = score27
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
228 posiphage = scores27.index(score27)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
229 sizephage = 27
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
230 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
231 phage_score = score32
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
232 posiphage = scores32.index(score32)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
233 sizephage = 32
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
234
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
235 if elems_maxi >= max(score10_8)*maxi10_8:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
236 i = posel[1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
237 ext = s[i-3:i-1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
238 if ext == 'TG': tg = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
239 else: tg = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
240 if elems_maxi > phage_max: host = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
241 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
242 host = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
243 tg = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
244 sc = max(score10_8)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
245 end35 = posel[0]+size35
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
246 dist = posel[1]-end35
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
247 scores.append([host, score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
248 posis.append([posel[1],posel[0],posiphage])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
249 sizes.append([6,size35,sizephage])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
250 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
251 host = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
252 sc = max(score10_8)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
253 i = score10_8.index(sc)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
254 ext = s[i-3:i-1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
255 if ext == 'TG': tg = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
256 else: tg = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
257 if max(score10_8)*maxi10_8 > phage_max: host = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
258 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
259 host = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
260 tg = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
261 end35 = posel[0]+size35
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
262 dist = posel[1]-end35
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
263 scores.append([host,score_elems[1],sc,score_elems[0],phage_score,tg,dist,str(seq)])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
264 posis.append([i,posel[0],posiphage])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
265 sizes.append([8,size35,sizephage])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
266 score = pd.DataFrame(scores, index=test.index, columns=['host','score10','score10_8','score35','score_phage','tg','dist','seq'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
267 posis = pd.DataFrame(posis, index=test.index, columns=['pos10','pos35','posphage'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
268 sizes = pd.DataFrame(sizes, index=test.index, columns=['size10','size35','size_phage'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
269 df_all = pd.concat([score,posis,sizes],axis=1)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
270 return df_all,dic
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
271
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
272 def create_dftest(scores_test,dic_window,family,bacteria,lifecycle):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
273 tudo = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
274 tudo2 = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
275 for ind,row in scores_test.iterrows():
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
276 _,window = ind.split(':')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
277 posis = dic_window[window]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
278 strand=posis[2]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
279 if strand == 1: ini=posis[0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
280 else: ini=posis[1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
281 seqprom = row['seq']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
282 score10 = row['score10']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
283 score10_8 = row['score10_8']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
284 score35 = row['score35']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
285 scorephage = row['score_phage']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
286 size10 = row['size10']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
287 size35 = row['size35']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
288 sizephage = row['size_phage']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
289 ini10 = row['pos10']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
290 tg = row['tg']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
291 host = row['host']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
292 ini35 = row['pos35']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
293 dist = row['dist']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
294 end10=ini10+size10
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
295 iniphage = row['posphage']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
296 endphage = iniphage+sizephage
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
297 if strand == 1:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
298 if host == 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
299 new_seq = seqprom[iniphage:endphage]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
300 new_ini = ini+iniphage+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
301 new_end = ini+endphage
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
302 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
303 if size10 == 6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
304 new_seq = seqprom[ini35:end10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
305 new_ini = ini+ini35+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
306 new_end = ini+end10
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
307 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
308 new_seq = seqprom[ini10:end10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
309 new_ini = ini+ini10+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
310 new_end = ini+end10
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
311 new_pos = '('+str(new_ini)+'..'+str(new_end)+')'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
312 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
313 if host == 0:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
314 new_seq = seqprom[iniphage:endphage]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
315 new_ini = ini-endphage+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
316 new_end = ini-iniphage
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
317 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
318 if size10 == 6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
319 new_seq = seqprom[ini35:end10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
320 new_ini = ini-end10+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
321 new_end = ini-ini35
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
322 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
323 new_seq = seqprom[ini10:end10]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
324 new_ini = ini-end10+1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
325 new_end = ini-ini10
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
326 new_pos = 'complement('+str(new_ini)+'..'+str(new_end)+')'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
327 if size10 == 6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
328 size10_6 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
329 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
330 size10_6 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
331 if size35 == 6:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
332 size35_6 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
333 size35_7 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
334 size35_9 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
335 size35_14 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
336 elif size35 == 7:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
337 size35_6 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
338 size35_7 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
339 size35_9 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
340 size35_14 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
341 elif size35 == 9:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
342 size35_6 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
343 size35_7 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
344 size35_9 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
345 size35_14 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
346 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
347 size35_6 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
348 size35_7 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
349 size35_9 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
350 size35_14 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
351 if sizephage == 23:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
352 sizephage_23 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
353 sizephage_21 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
354 sizephage_32 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
355 elif sizephage == 21:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
356 sizephage_23 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
357 sizephage_21 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
358 sizephage_32 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
359 elif sizephage == 32:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
360 sizephage_23 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
361 sizephage_21 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
362 sizephage_32 = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
363 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
364 sizephage_23 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
365 sizephage_21 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
366 sizephage_32 = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
367 if family == 'Podoviridae':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
368 Podo = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
369 Sipho = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
370 Myo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
371 elif family == 'Siphoviridae':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
372 Podo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
373 Sipho = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
374 Myo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
375 elif family == 'Myoviridae':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
376 Podo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
377 Sipho = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
378 Myo = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
379 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
380 Podo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
381 Sipho = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
382 Myo = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
383 if bacteria == 'Bacillus':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
384 bac = [1,0,0,0,0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
385 elif bacteria == 'Escherichia coli':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
386 bac = [0,1,0,0,0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
387 elif bacteria == 'Klebsiella':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
388 bac = [0,0,1,0,0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
389 elif bacteria == 'Pectobacterium':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
390 bac = [0,0,0,1,0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
391 elif bacteria == 'Cronobacter':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
392 bac = [0,0,0,0,1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
393 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
394 bac = [0,0,0,0,0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
395 if lifecycle == 'virulent': tp = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
396 else: tp = 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
397 fe = free_energy(str(seqprom))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
398 AT = freq_base(str(seqprom))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
399 linha = [score10, score10_8, score35, dist, scorephage, fe, AT, host,size10_6,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
400 size35_6, size35_7, size35_9, size35_14,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
401 sizephage_23, sizephage_21,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
402 sizephage_32, tg, Podo, Sipho, Myo,tp]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
403 linha.extend(bac)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
404 tudo.append(linha)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
405 linha2 = [new_pos,str(new_seq), host, size10_6,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
406 score10, score10_8, size35_6, size35_7, size35_9,size35_14,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
407 score35, dist, sizephage_23, sizephage_21,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
408 sizephage_32, scorephage, tg, Podo, Sipho, Myo,tp, fe, AT]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
409 linha2.extend(bac)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
410 tudo2.append(linha2)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
411 df_test = pd.DataFrame(tudo, index=scores_test.index,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
412 columns = ['score10', 'score10_8','score35', 'dist35_10',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
413 'scorephage','fe', 'freqAT',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
414 'host','size10',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
415 'size35_6', 'size35_7','size35_9','size35_14',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
416 'sizephage_23',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
417 'sizephage_21', 'sizephage_32',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
418 'TG', 'Podo', 'Sipho', 'Myo', 'tp',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
419 'Bacillus', 'EColi',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
420 'Pectobacterium','Klebsiella', 'Cronobacter'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
421 df_INFO = pd.DataFrame(tudo2, index=scores_test.index,
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
422 columns = ['Positions','Promoter Sequence','host','size10',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
423 'score10', 'score10_8',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
424 'size35_6', 'size35_7','size35_9','size35_14',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
425 'score35', 'dist35_10','sizephage_23',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
426 'sizephage_21', 'sizephage_32',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
427 'scorephage', 'TG', 'Podo', 'Sipho', 'Myo', 'tp','fe', 'freqAT',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
428 'EColi', 'Salmonella',
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
429 'Pectobacterium','Cronobacter', 'Streptococcus'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
430 return df_test,df_INFO
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
431
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
432
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
433 def get_predictions(scaler_file,model_file,test,df_testinfo,threshold):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
434 from sklearn.externals import joblib
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
435 scaler = joblib.load(scaler_file)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
436 model = joblib.load(model_file)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
437 feat_scaled = pd.DataFrame(scaler.transform(test.iloc[:,:7]),index =test.index, columns=test.columns[:7])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
438 TEST_scaled = pd.concat([feat_scaled,test.iloc[:,7:]],axis=1)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
439 scores = model.predict_proba(TEST_scaled)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
440 pos_scores = np.empty((TEST_scaled.shape[0],0), float)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
441 for x in scores: pos_scores = np.append(pos_scores,x[1])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
442 try: positive_indexes = np.nonzero(pos_scores>float(threshold))[0] #escolher os positivos, podia ser escolher com score > x
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
443 except ValueError: return 'The threshold value is not a float'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
444 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
445 if len(positive_indexes) == 0: return None
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
446 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
447 positive_windows = TEST_scaled.index[positive_indexes]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
448 INFO = df_testinfo.loc[positive_windows,['Positions','Promoter Sequence']]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
449 promoter_type = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
450 for x in df_testinfo.loc[positive_windows,'host'].tolist():
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
451 if x == 0: promoter_type.append('phage')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
452 else: promoter_type.append('host')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
453 INFO['Type'] = promoter_type
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
454 INFO['Scores'] = np.around(pos_scores[positive_indexes],decimals=3)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
455 INFO.index = positive_windows
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
456 return INFO
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
457
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
458
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
459 def get_finaldf(test,rec):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
460 new_df = test.groupby(['Positions'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
461 groups = list(new_df.groups.keys())
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
462 for i in range(len(groups)-1):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
463 for j in range(i, len(groups)):
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
464 if 'complement' in groups[i]: inii = int(groups[i][11:].split('..')[0])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
465 else: inii = int(groups[i][1:].split('..')[0])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
466 if 'complement' in groups[j]: inij = int(groups[j][11:].split('..')[0])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
467 else: inij = int(groups[j][1:].split('..')[0])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
468 if inij < inii:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
469 temp = groups[i]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
470 groups[i] = groups[j]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
471 groups[j] = temp
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
472 new_inds = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
473 for g in groups:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
474 inds = new_df.groups[g]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
475 if len(inds) == 1: new_inds.append(inds[0])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
476 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
477 maxi = max(new_df.get_group(g)['Scores'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
478 i = new_df.groups[g][new_df.get_group(g)['Scores']==maxi][0]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
479 new_inds.append(i)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
480
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
481 output = test.loc[new_inds,:]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
482 strands = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
483 new_pos = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
484 old_pos = output['Positions'].tolist()
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
485
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
486 from Bio.SeqFeature import SeqFeature, FeatureLocation
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
487 feats = rec.features
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
488 for ind, row in output.iterrows():
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
489 pos = row['Positions']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
490 if 'complement' in pos:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
491 strands.append('-')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
492 new_pos.append(pos[10:])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
493 ini,end= pos[11:-1].split('..')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
494 new_loc = FeatureLocation(int(ini),int(end),strand=-1)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
495 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
496 strands.append('+')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
497 new_pos.append(pos)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
498 ini,end= pos[1:-1].split('..')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
499 new_loc = FeatureLocation(int(ini),int(end),strand=1)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
500 feat = SeqFeature(new_loc, type='regulatory',qualifiers={'regulatory_class':['promoter'], 'note=':['predicted by PhagePromoter']})
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
501 feats.append(feat)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
502
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
503 output.insert(loc=0, column='Strand', value=strands)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
504 output['Positions'] = new_pos
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
505 output.to_html('output.html',index=False, justify='center')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
506 recs = []
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
507 i = 0
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
508 for ind,row in output.iterrows():
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
509 s = Seq(row['Promoter Sequence'])
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
510 posis = old_pos[i]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
511 typ = row['Type']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
512 score = row['Scores']
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
513 sq = SeqRecord(seq=s, id=ind, description=typ+' '+posis+' score='+str(score))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
514 recs.append(sq)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
515 i += 1
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
516 SeqIO.write(recs, 'output.fasta','fasta')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
517 new_rec = rec
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
518 new_rec.seq.alphabet = IUPAC.IUPACAmbiguousDNA()
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
519 new_feats = sorted(feats, key=lambda x: x.location.start)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
520 new_rec.features = new_feats
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
521 SeqIO.write(new_rec,'output.gb','genbank')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
522
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
523 if __name__== "__main__":
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
524
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
525 import sys
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
526 import os
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
527 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
528
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
529 gen_format = sys.argv[1]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
530 genome_file = sys.argv[2]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
531 both = sys.argv[3]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
532 threshold = sys.argv[4]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
533 family = sys.argv[5]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
534 host = sys.argv[6]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
535 phage_type = sys.argv[7]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
536 model = sys.argv[8]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
537 '''
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
538 gen_format = 'genbank'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
539 genome_file = 'test-data/NC_015264.gb'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
540 both = False
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
541 threshold = '0.50'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
542 family = 'Podoviridae'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
543 host = 'Pseudomonas'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
544 phage_type = 'virulent'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
545 model = 'SVM2400'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
546 #model = 'ANN1600'
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
547 '''
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
548
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
549 rec = SeqIO.read(genome_file, gen_format)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
550 test_windows = get_testseqs65(gen_format, genome_file,both)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
551 try: score_test,dic_window = get_testScores(__location__,test_windows)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
552 except IndexError: print('Error. Input sequence can only have A,C,G or T')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
553 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
554 df_test,df_testinfo = create_dftest(score_test,dic_window,family,host,phage_type)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
555 if model == 'ANN1600':
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
556 scaler_file = os.path.join(__location__, 'scaler1600.sav')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
557 model_file = os.path.join(__location__, 'model1600.sav')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
558 preds = get_predictions(scaler_file, model_file, df_test,df_testinfo,threshold)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
559 if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
560 elif type(preds) == str: print(preds)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
561 else: output = get_finaldf(preds,rec)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
562 else:
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
563 scaler_file = os.path.join(__location__, 'scaler2400.sav')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
564 model_file = os.path.join(__location__, 'model2400.sav')
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
565 new_df_test = df_test.iloc[:,[0,1,2,3,4,5,6,7,8,9,13,14,16,17,19,20,22,24,25]]
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
566 preds = get_predictions(scaler_file, model_file, new_df_test,df_testinfo,threshold)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
567 if preds is None: print('There is no sequence with a score value higher or equal to the threshold '+str(threshold))
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
568 elif type(preds) == str: print(preds)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
569 else: output = get_finaldf(preds,rec)
30b5e33eca40 Uploaded
martasampaio
parents:
diff changeset
570