0
|
1
|
|
class FeatureConstruction:
    """
    In development. Extract features from proteins.

    Builds a table of phage--bacterium pairs (``self.features_data``, indexed
    by ``'<phage>--<bacterium>'`` with an ``Infects`` column) and adds
    numerical descriptors computed on conjoint-triad encoded protein
    sequences: 2-mer frequencies, class composition and per-segment
    ("grouping") composition.

    All input and output files live under the relative directory ``files/``.
    """

    # The seven conjoint-triad residue classes, as single characters.
    _GROUPS = '0123456'
    # Names of the sequence segments used by the grouping descriptors.
    _LETTERS = 'ABCDEFGHIJ'
    # Segment name -> (start fraction, end fraction); None means "to the end".
    _SEGMENTS = {
        'A': (0.0, 0.25),
        'B': (0.25, 0.5),
        'C': (0.5, 0.75),
        'D': (0.75, None),
        'E': (0.0, 0.5),
        'F': (0.5, None),
        'G': (0.25, 0.75),
        'H': (0.0, 0.75),
        'I': (0.25, None),
        'J': (0.125, 0.875),
    }
    # Amino acid -> conjoint-triad class.
    _CONJOINT_TABLE = str.maketrans({
        'A': '0', 'G': '0', 'V': '0',
        'C': '1',
        'F': '2', 'I': '2', 'L': '2', 'P': '2',
        'M': '3', 'S': '3', 'T': '3', 'Y': '3',
        'H': '4', 'N': '4', 'Q': '4', 'W': '4',
        'K': '5', 'R': '5',
        'D': '6', 'E': '6',
    })
    # Ambiguous / rare residue codes are replaced before encoding.
    _AMBIGUOUS = str.maketrans('XBZU', 'DNEC')
    # The k-mer descriptors additionally map J to L.
    _AMBIGUOUS_WITH_J = str.maketrans('XBZUJ', 'DNECL')

    # A protein is kept when its description contains one of these ...
    _TAIL_KEYWORDS = (
        'fiber', 'fibre', 'spike', 'hydrolase', 'bind', 'depolymerase',
        'peptidase', 'lyase', 'sialidase', 'dextranase', 'lipase', 'adhesin',
        'baseplate', 'protein h', 'recognizing', 'protein j', 'protein g',
        'gpe', 'duf4035', 'host specifity', 'cor protein', 'specificity',
        'baseplate component', 'gp38', 'gp12 tail', 'receptor', 'recognition',
        'tail',
    )
    # ... and none of these.
    _NON_TAIL_KEYWORDS = (
        'nucle', 'dna', 'rna', 'ligase', 'transferase', 'inhibitor',
        'assembly', 'connect', 'nudix', 'atp', 'nad', 'transpos', 'ntp',
        'molybdenum', 'hns', 'gtp', 'riib', 'inhibitor', 'replicat', 'codon',
        'pyruvate', 'catalyst', 'hinge', 'sheath completion', 'head',
        'capsid', 'tape', 'tip', 'strand', 'matur', 'portal', 'terminase',
        'nucl', 'promot', 'block', 'olfact', 'wedge', 'lysozyme', 'mur',
        'sheat',
    )

    def __init__(self):
        """
        Load the phage proteins and build (or reload) the pair table.

        Reads ``files/NCBI_Phage_Bacteria_Data.csv`` and
        ``files/phagesProteins.json``, filters the phage proteins, and then
        either loads ``files/FeatureDataset`` when it exists or builds
        ``self.features_data`` from scratch: one ``'Yes'`` row per known
        phage/host pair whose host has a file in ``files/bacteria``, plus up
        to 12 randomly drawn ``'No'`` rows per phage taken from hosts of a
        different species.
        """
        import ast
        import json
        import os
        from pathlib import Path
        from random import choice

        import pandas as pd

        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0)
        with open('files/phagesProteins.json', encoding='utf-8') as handle:
            self.phagesProteins = json.loads(handle.read())
        self._filter_phage_domains()
        # NOTE(review): the add_* methods read self.list_prot, which was never
        # assigned anywhere in this class. phageTails has the layout they
        # expect (phage -> protein -> [description, sequence]); confirm that
        # this is the intended source.
        self.list_prot = self.phageTails

        if Path('files/FeatureDataset').is_file():
            self.import_feat_data()
            return

        # Listing the directory once instead of once per candidate host.
        available = set(os.listdir('files/bacteria'))
        usable = [phage for phage in self.phageTails
                  if phage in data.index and self.phageTails[phage]]
        # Hosts seen for each species; 'abaumannii' is collected but, as in
        # the sampling rules below, never used as a source of negatives.
        pools = {'ecoli': {}, 'kpneumoniae': {}, 'abaumannii': {}}
        species_of = {}
        all_phages = {}

        for phage in usable:
            species_of[phage] = self._host_species(data.loc[phage, 'Host'])
            for bact in ast.literal_eval(data.loc[phage, 'Host_ID']):
                bact = bact[:-2]  # drop the two-character suffix of the ID
                if bact + '.json' not in available:
                    continue
                all_phages[phage + '--' + bact] = 'Yes'
                if species_of[phage] is not None:
                    pools[species_of[phage]][bact] = 0

        # Species of the phage's host -> pool the negatives are drawn from.
        negative_source = {
            'ecoli': 'kpneumoniae',
            'kpneumoniae': 'ecoli',
            'abaumannii': 'kpneumoniae',
        }
        candidates = {name: list(pool) for name, pool in pools.items()}
        for phage in usable:
            species = species_of[phage]
            if species is None:
                continue
            pool = candidates[negative_source[species]]
            if not pool:
                # Nothing to sample from; skip instead of failing.
                continue
            for _ in range(12):
                # setdefault keeps a known 'Yes' pair from being relabelled.
                all_phages.setdefault(phage + '--' + choice(pool), 'No')

        self.features_data = pd.DataFrame({'ID': list(all_phages.keys()),
                                           'Infects': list(all_phages.values())})
        self.features_data = self.features_data.set_index('ID')

    @staticmethod
    def _host_species(host_name):
        """Return 'ecoli', 'kpneumoniae', 'abaumannii' or None for a host name."""
        lowered = host_name.lower()
        if 'escherichia' in lowered or 'coli' in lowered:
            return 'ecoli'
        if 'klebsiella' in lowered or 'pneumoniae' in lowered:
            return 'kpneumoniae'
        if 'acinetobacter' in lowered or 'baumannii' in lowered:
            return 'abaumannii'
        return None

    @classmethod
    def _is_tail_protein(cls, description):
        """Return True when a protein description passes the keyword filter."""
        lowered = description.lower()
        wanted = any(word in lowered for word in cls._TAIL_KEYWORDS)
        unwanted = any(word in lowered for word in cls._NON_TAIL_KEYWORDS)
        return wanted and not unwanted

    def _filter_phage_domains(self):
        """
        Filters out unwanted proteins. Domains that are unknown or are not
        associated with fibers, spikes, tails, enzymatic or binding are not
        considered. Still in development.

        The result is cached in ``files/phageTails.json``; when that file
        exists it is loaded instead of being recomputed. On a fresh run the
        selected sequences are also written to ``files/tails.fasta``.

        :return: phageTails, a dictionary phage -> protein -> record holding
                 only the proteins whose description (record[0]) passes the
                 keyword filter
        """
        import json
        from pathlib import Path

        if Path('files/phageTails.json').is_file():
            with open('files/phageTails.json', encoding='utf-8') as handle:
                self.phageTails = json.loads(handle.read())
            return self.phageTails

        self.phageTails = {}
        for phage in self.phagesProteins:
            proteins = self.phagesProteins[phage]
            self.phageTails[phage] = {
                protein: proteins[protein]
                for protein in proteins
                if self._is_tail_protein(proteins[protein][0])
            }
        with open('files/phageTails.json', 'w') as handle:
            json.dump(self.phageTails, handle)
        self.__create_phage_fasta()
        return self.phageTails

    def _filter_bacteria(self):
        """
        Keep the bacterial proteins predicted to be membrane-bound or secreted.

        Uses the PSORT-style table ``files/results_psort.txt``: a protein is
        kept when its highest positive score is the cytoplasmic-membrane,
        outer-membrane or extracellular one. The result is cached in
        ``files/externalProts.json`` and loaded from there when present.
        Requires ``self.bactProteins`` when the cache does not exist.

        :return: externalProts, a dictionary bacterium -> protein -> sequence
        """
        import json
        from pathlib import Path

        import pandas as pd

        if Path('files/externalProts.json').is_file():
            with open('files/externalProts.json', encoding='utf-8') as handle:
                self.externalProts = json.loads(handle.read())
            return self.externalProts

        score_columns = ['Cytoplasmic_Score', 'CytoplasmicMembrane_Score',
                         'Periplasmic_Score', 'OuterMembrane_Score',
                         'Extracellular_Score']
        external = ('CytoplasmicMembrane_Score', 'OuterMembrane_Score',
                    'Extracellular_Score')

        self.externalProts = {}
        predictions = pd.read_csv('files/results_psort.txt', sep='\t', index_col=False)
        predictions = predictions.set_index('SeqID')
        predictions = predictions.drop_duplicates()
        for bac in self.bactProteins:
            self.externalProts[bac] = {}
            for protein in self.bactProteins[bac]:
                key = protein + ' '  # IDs in the table carry a trailing space
                if key not in predictions.index:
                    continue
                # Reset per protein so a protein without any positive score
                # neither fails nor inherits the previous protein's location.
                location, best_score = None, 0.0
                for column in score_columns:
                    score = predictions.loc[key, column]
                    if score > best_score:
                        location, best_score = column, score
                if location in external:
                    self.externalProts[bac][protein] = self.bactProteins[bac][protein][1]
        if self.externalProts != {}:
            del self.bactProteins
        with open('files/externalProts.json', 'w') as handle:
            json.dump(self.externalProts, handle)
        return self.externalProts

    def __create_phage_fasta(self):
        """
        Creates a fasta file (``files/tails.fasta``) containing every
        selected protein sequence for every phage.

        :return: None
        """
        with open('files/tails.fasta', 'w') as handle:
            for proteins in self.phageTails.values():
                for name, record in proteins.items():
                    handle.write('>' + name + '\n' + record[1] + '\n')

    # ------------------------------------------------------------------
    # Shared helpers for the descriptor methods
    # ------------------------------------------------------------------

    @staticmethod
    def _load_bacterium(bact):
        """Load the protein dictionary stored in files/bacteria/<bact>.json."""
        import json
        with open('files/bacteria/' + bact + '.json', encoding='utf-8') as handle:
            return json.loads(handle.read())

    @classmethod
    def _phage_sequences(cls, proteins, map_j=False):
        """Return the cleaned sequence (record[1]) of every phage protein."""
        table = cls._AMBIGUOUS_WITH_J if map_j else cls._AMBIGUOUS
        return [proteins[name][1].translate(table) for name in proteins]

    @classmethod
    def _bacterial_sequences(cls, proteins, map_j=False):
        """
        Return the cleaned sequence of every bacterial protein.

        Sequences are cut at the first double quote. partition() is used so
        that a sequence without a quote is kept whole (slicing with
        ``find()`` dropped its last residue in that case).
        """
        table = cls._AMBIGUOUS_WITH_J if map_j else cls._AMBIGUOUS
        return [proteins[name][1].partition('"')[0].translate(table)
                for name in proteins]

    @staticmethod
    def _kmer_frequencies(encoded, k=2):
        """
        Relative frequencies of the overlapping k-mers of ``encoded``.

        Each count is divided by the number of k-mers, ``len - k + 1``; this
        is meant to mirror scikit-bio's
        ``Sequence.kmer_frequencies(k, overlap=True, relative=True)``.
        Sequences shorter than ``k`` give an empty dict.
        """
        from collections import Counter
        n_kmers = len(encoded) - k + 1
        if n_kmers <= 0:
            return {}
        counts = Counter(encoded[pos:pos + k] for pos in range(n_kmers))
        return {kmer: count / n_kmers for kmer, count in counts.items()}

    def _mean_kmers(self, sequences):
        """
        Average 2-mer descriptor over ``sequences``.

        :return: dict with the 49 class pairs ('00' ... '66') as keys
        """
        totals = {}
        for first in self._GROUPS:
            for second in self._GROUPS:
                totals[first + second] = 0.0
        n_prots = 0
        for sequence in sequences:
            n_prots += 1
            freqs = self._kmer_frequencies(self.__get_conjoint_triad(sequence))
            if not freqs:
                continue
            # NOTE(review): this subtracts min/max from every frequency; it
            # is not a min-max scaling. Kept as is so feature values stay
            # comparable with existing datasets.
            offset = min(freqs.values()) / max(freqs.values())
            for kmer, freq in freqs.items():
                # k-mers with symbols outside the seven classes are ignored.
                if kmer in totals:
                    totals[kmer] += freq - offset
        if n_prots != 0:
            for kmer in totals:
                totals[kmer] = totals[kmer] / n_prots
        return totals

    def _mean_composition(self, sequences):
        """
        Average class composition over ``sequences``, scaled to sum to 1.

        :return: dict class ('0' ... '6') -> value; all zeros when there is
                 no sequence or nothing to normalise
        """
        groups = self._GROUPS
        acc = {group: 0.0 for group in groups}
        n_prots = 0
        for sequence in sequences:
            n_prots += 1
            encoded = self.__get_conjoint_triad(sequence)
            counts = {group: encoded.count(group) for group in groups}
            highest = max(counts.values())
            if highest == 0:
                # Empty or unencodable sequence: nothing to add.
                continue
            # NOTE(review): same "min/max" offset as in the k-mer descriptor.
            offset = min(counts.values()) / highest
            for group in groups:
                acc[group] += counts[group] - offset
        if n_prots == 0:
            return acc
        total = 0
        for group in groups:
            acc[group] = acc[group] / n_prots
            total += acc[group]
        if total == 0:
            return {group: 0.0 for group in groups}
        return {group: acc[group] / total for group in groups}

    def _mean_grouping(self, sequences):
        """
        Average per-segment class fractions over ``sequences``.

        :return: dict 'group<letter>_<class>' -> value
        """
        acc = {}
        for group in self._GROUPS:
            for letter in self._LETTERS:
                acc['group' + letter + '_' + group] = 0.0
        n_prots = 0
        for sequence in sequences:
            n_prots += 1
            encoded = self.__get_conjoint_triad(sequence)
            for letter in self._LETTERS:
                fractions = self.__get_grouping(encoded, letter)
                for group in self._GROUPS:
                    acc['group' + letter + '_' + group] += fractions[group]
        if n_prots != 0:
            for key in acc:
                acc[key] = acc[key] / n_prots
        return acc

    def _descriptor_rows(self, describe, map_j=False):
        """
        Compute one descriptor dict per row of ``self.features_data``.

        ``describe`` turns a list of sequences into a descriptor dict.
        Results are cached per phage and per bacterium, so each protein set
        (and each bacterium file) is processed only once.

        :return: (phage_rows, bact_rows), two lists aligned with the index
        """
        phage_cache = {}
        bact_cache = {}
        phage_rows = []
        bact_rows = []
        for pair_id in self.features_data.index:
            phage, _, bact = pair_id.partition('--')
            if phage not in phage_cache:
                phage_cache[phage] = describe(
                    self._phage_sequences(self.list_prot[phage], map_j))
            if bact not in bact_cache:
                bact_cache[bact] = describe(
                    self._bacterial_sequences(self._load_bacterium(bact), map_j))
            phage_rows.append(phage_cache[phage])
            bact_rows.append(bact_cache[bact])
        return phage_rows, bact_rows

    # ------------------------------------------------------------------
    # Descriptors
    # ------------------------------------------------------------------

    def add_kmers(self):
        """
        Add the 2-mer descriptors to ``self.features_data``.

        Creates the columns ``phage_kmer_<pair>`` and ``bact_kmer_<pair>``
        for the 49 class pairs.
        """
        phage_rows, bact_rows = self._descriptor_rows(self._mean_kmers, map_j=True)
        for first in self._GROUPS:
            for second in self._GROUPS:
                kmer = first + second
                self.features_data['phage_kmer_' + kmer] = [row[kmer] for row in phage_rows]
                self.features_data['bact_kmer_' + kmer] = [row[kmer] for row in bact_rows]

    def get_kmers(self, phage, bacteria):
        """
        Compute the 2-mer descriptors of one phage/bacterium pair.

        :param phage: dict protein -> record, with the sequence at record[1]
        :param bacteria: dict protein -> record, with the sequence at record[1]
        :return: list of 98 values: phage then bacterium for each class pair
        """
        # J is mapped to L here as well, so that the values agree with the
        # columns produced by add_kmers.
        phage_means = self._mean_kmers(self._phage_sequences(phage, map_j=True))
        bact_means = self._mean_kmers(self._bacterial_sequences(bacteria, map_j=True))
        solution = []
        for kmer in phage_means:
            solution.append(phage_means[kmer])
            solution.append(bact_means[kmer])
        return solution

    def add_composition(self):
        """
        Add the class-composition descriptors to ``self.features_data``.

        Creates the columns ``bact_comp_<class>`` and ``phage_comp_<class>``.
        """
        phage_rows, bact_rows = self._descriptor_rows(self._mean_composition)
        for group in self._GROUPS:
            self.features_data['bact_comp_' + group] = [row[group] for row in bact_rows]
            self.features_data['phage_comp_' + group] = [row[group] for row in phage_rows]

    def get_composition(self, phage, bacteria):
        """
        Compute the class composition of one phage/bacterium pair.

        :param phage: dict protein -> record, with the sequence at record[1]
        :param bacteria: dict protein -> record, with the sequence at record[1]
        :return: list of 14 values: bacterium then phage for each class
        """
        phage_comp = self._mean_composition(self._phage_sequences(phage))
        bact_comp = self._mean_composition(self._bacterial_sequences(bacteria))
        solution = []
        for group in self._GROUPS:
            solution.append(bact_comp[group])
            solution.append(phage_comp[group])
        return solution

    def add_grouping(self):
        """
        Add the per-segment descriptors to ``self.features_data``.

        Creates the columns ``bact_group<letter>_<class>`` and
        ``phage_group<letter>_<class>``.
        """
        phage_rows, bact_rows = self._descriptor_rows(self._mean_grouping)
        for group in self._GROUPS:
            for letter in self._LETTERS:
                key = 'group' + letter + '_' + group
                self.features_data['bact_' + key] = [row[key] for row in bact_rows]
                self.features_data['phage_' + key] = [row[key] for row in phage_rows]

    def get_grouping(self, phage, bacteria):
        """
        Compute the per-segment descriptors of one phage/bacterium pair.

        :param phage: dict protein -> record, with the sequence at record[1]
        :param bacteria: dict protein -> record, with the sequence at record[1]
        :return: list of 140 values: bacterium then phage for each
                 (class, segment) combination
        """
        phage_group = self._mean_grouping(self._phage_sequences(phage))
        bact_group = self._mean_grouping(self._bacterial_sequences(bacteria))
        solution = []
        for group in self._GROUPS:
            for letter in self._LETTERS:
                key = 'group' + letter + '_' + group
                solution.append(bact_group[key])
                solution.append(phage_group[key])
        return solution

    def __get_conjoint_triad(self, prot):
        """
        Replace every amino acid of ``prot`` by its conjoint-triad class.

        :param prot: protein sequence (str)
        :return: the sequence with the 20 standard residues replaced by the
                 digits '0' to '6'; other characters are left unchanged
        """
        return prot.translate(self._CONJOINT_TABLE)

    def __get_grouping(self, prot, let='A'):
        """
        Fraction of each class inside one segment of an encoded sequence.

        :param prot: conjoint-triad encoded sequence (anything supporting
                     len(), slicing and count())
        :param let: segment name, one of 'A' ... 'J' (see _SEGMENTS)
        :return: dict class -> fraction; all zeros for an unknown segment
                 name or when the segment is empty (very short sequences)
        """
        group = {symbol: 0.0 for symbol in self._GROUPS}
        bounds = self._SEGMENTS.get(let)
        if bounds is None:
            return group
        start_fraction, end_fraction = bounds
        length = len(prot)
        start = int(length * start_fraction)
        end = length if end_fraction is None else int(length * end_fraction)
        segment = prot[start:end]
        size = len(segment)
        if size == 0:
            # Avoid dividing by zero for sequences too short to be split.
            return group
        for symbol in self._GROUPS:
            group[symbol] += segment.count(symbol) / size
        return group

    def set_output(self):
        """
        Add a ``Bacteria`` column holding the host species of each pair.

        The species is derived from the host name of the pair's phage in
        ``files/NCBI_Phage_Bacteria_Data.csv``. Pairs whose host name matches
        none of the three species get None, which keeps the column aligned
        with the table. Works whether the pair ID is still a column named
        ``ID`` or already the index.
        """
        import pandas as pd

        data = pd.read_csv('files/NCBI_Phage_Bacteria_Data.csv', header=0, index_col=0,
                           names=['Phage Name', 'Bacteria Name', 'Bacteria ID'])
        id_is_column = 'ID' in self.features_data.columns
        pair_ids = self.features_data['ID'] if id_is_column else self.features_data.index

        output = []
        for pair_id in pair_ids:
            phage = pair_id[:pair_id.find('--')]
            host = data.loc[phage, 'Bacteria Name'].lower()
            if 'escherichia' in host:
                output.append('Escherichia coli')
            elif 'klebsiella' in host:
                output.append('Klebsiella pneumoniae')
            elif 'acinetobacter' in host:
                output.append('Acinetobacter baumannii')
            else:
                output.append(None)
        if id_is_column:
            self.features_data = self.features_data.set_index('ID')
        self.features_data['Bacteria'] = output

    def save_feat_data(self):
        """
        Pickle ``self.features_data`` to ``files/FeatureDataset``.

        :return: the saved DataFrame
        """
        import pickle
        with open('files/FeatureDataset', 'wb') as handle:
            pickle.dump(self.features_data, handle)
        return self.features_data

    def import_feat_data(self):
        """
        Load ``self.features_data`` from ``files/FeatureDataset``.

        :return: the loaded DataFrame
        """
        import pickle
        # SECURITY: unpickling executes arbitrary code; only load a
        # FeatureDataset file that was produced by save_feat_data.
        with open('files/FeatureDataset', 'rb') as handle:
            self.features_data = pickle.load(handle)
        return self.features_data
|
|
685
|
|
686
|
|
if __name__ == '__main__':
    # Script entry point: build (or reload) the pair table, add the three
    # descriptor families in this order, then pickle the result.
    test = FeatureConstruction()
    # test.process_net_surf()
    test.add_grouping()
    test.add_composition()
    test.add_kmers()
    # test.set_output()
    test.save_feat_data()
    # The string below is disabled code kept as a no-op expression; the
    # methods it names are not defined in the visible class.
    '''
    test.process_net_surf()
    test.add_aa_freq()
    test.add_aromaticity()
    test.add_flexibility()
    test.add_molecular_weight()'''
    # test.import_feat_data()
    # test.netSurf()
|