"""Download and cache phage and bacterial host protein sequences from NCBI."""

class PhageBacteriaInformation:
    """
    Collects protein sequences for phages and for their bacterial hosts.

    The constructor loads and filters a phage/host table stored under ``files/``.
    The other methods download GenBank records through Biopython's ``Entrez``
    module, extract the proteins annotated on CDS features and cache them as
    JSON and FASTA files under ``files/``. Only proteins are stored; no DNA
    sequence is kept.
    """

    # Value assigned to ``Entrez.email`` before every request.
    _ENTREZ_EMAIL = 'pedro_araujo97@hotmail.com'

    def __init__(self, dataset=None):
        """
        Loads the phage/host table, filters it and saves the filtered copy.

        The first CSV column is used as the index (the phage accession) and the
        columns ``Host`` and ``Host_ID`` must exist. Rows are dropped when they
        contain any missing value, when ``Host_ID`` is the literal ``'[]'`` or
        when ``Host`` is a single word. The result is written to
        ``files/Filtered_Phage_Bacteria.csv`` and the unique host accessions are
        kept in ``self.all_bact``.

        :param dataset: name of a CSV file located inside ``files/``
        :raises TypeError: if ``dataset`` is not given
        """
        import pandas as pd

        if dataset is None:
            # The original string concatenation also failed with TypeError here;
            # fail with a message that says what is missing.
            raise TypeError("dataset must be the name of a CSV file inside 'files/'")

        self.phagesProteins = {}
        self.bactProteins = {}

        data = pd.read_csv('files/' + dataset, header=0, index_col=0)
        data = data.dropna(how='any')
        data = data[data['Host_ID'] != '[]']
        # Row-wise boolean mask: no positional lookups on a label index, and
        # rows sharing an index label are judged individually.
        keep = data['Host'].map(self._is_multiword_host).astype(bool)
        self.data = data[keep]

        self.all_bact = self._unique_host_ids(self.data['Host_ID'])
        self.data.to_csv('files/Filtered_Phage_Bacteria.csv')

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _is_multiword_host(host):
        """Returns True when the host name has more than one space-separated word."""
        return len(host.split(' ')) > 1

    @staticmethod
    def _unique_host_ids(host_id_entries):
        """
        Returns the unique host accessions in first-seen order.

        :param host_id_entries: iterable of strings, each the ``repr`` of a list
            of accessions (parsed with ``ast.literal_eval``)
        :return: list of accessions with their last two characters removed
        """
        import ast

        unique = {}
        for entry in host_id_entries:
            for accession in ast.literal_eval(entry):
                # NOTE(review): dropping two characters presumably strips a
                # one-digit version suffix such as ".1" - confirm that versions
                # with two or more digits never occur.
                unique.setdefault(accession[:-2], None)
        return list(unique)

    @staticmethod
    def _proteins_from_features(features):
        """
        Builds ``{protein_id: [product, translation]}`` from parsed features.

        Features that are not of type ``CDS`` or that lack any of the three
        qualifiers are skipped.
        """
        proteins = {}
        for feat in features:
            if feat.type != 'CDS':
                continue
            quals = feat.qualifiers
            try:
                proteins[quals['protein_id'][0]] = [quals['product'][0], quals['translation'][0]]
            except (KeyError, IndexError):
                # Incomplete annotation: nothing to store for this CDS.
                continue
        return proteins

    @staticmethod
    def _proteins_from_flatfile(lines):
        """
        Builds ``{protein_id: [product, translation]}`` from raw GenBank text lines.

        For every line containing ``' CDS '`` the following lines are scanned for
        ``/product=``, ``..._id=`` and ``/translation=``. A translation is
        extended with the following lines until an all-lowercase line is met and
        is then cut at its first double quote. Only the first line of a product
        is used. A CDS is stored only when all three values were found during its
        own scan, so values never leak from a previous CDS.

        :param lines: list of text lines of a GenBank flat file
        """
        proteins = {}
        total = len(lines)
        for start, line in enumerate(lines):
            if ' CDS ' not in line:
                continue
            product = prot_key = sequence = None
            pos = start
            while pos < total:
                current = lines[pos]
                pos += 1
                if '/product=' in current:
                    # Skip the 10 characters of '/product="' and the closing quote.
                    product = current.strip()[10:].rstrip('"')
                elif '_id=' in current:
                    # Skip the 13 characters of '/protein_id="' and the closing quote.
                    prot_key = current.strip()[13:-1]
                elif '/translation=' in current:
                    # Skip the 14 characters of '/translation="'.
                    pieces = [current.strip()[14:]]
                    terminated = False
                    for nxt in range(pos, total):
                        if lines[nxt].islower():
                            terminated = True
                            break
                        pieces.append(lines[nxt].strip())
                    sequence = ''.join(pieces)
                    if terminated:
                        break
            if product is None or prot_key is None or sequence is None:
                continue
            # Everything from the closing quote onwards is not part of the protein.
            proteins[prot_key] = [product, sequence.partition('"')[0]]
        return proteins

    @staticmethod
    def _fasta_text(var):
        """
        Renders ``var`` as FASTA text.

        Dict values produce one ``>protein-species`` record per protein, using
        index 1 of each protein entry as the sequence. Non-empty non-dict values
        produce a single ``>species`` record with the value as the sequence.
        """
        chunks = []
        for spec, entries in var.items():
            if isinstance(entries, dict):
                for prot, info in entries.items():
                    chunks.append('>' + prot + '-' + spec + '\n' + info[1] + '\n')
            elif entries:
                chunks.append('>' + spec + '\n' + entries + '\n')
        return ''.join(chunks)

    @classmethod
    def _entrez(cls):
        """Imports ``Bio.Entrez``, sets the contact e-mail and returns the module."""
        from Bio import Entrez

        Entrez.email = cls._ENTREZ_EMAIL
        return Entrez

    @classmethod
    def _read_record(cls, accession):
        """Downloads ``accession`` from the nucleotide database as a parsed GenBank record."""
        from Bio import SeqIO

        entrez = cls._entrez()
        with entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=accession) as handle:
            return SeqIO.read(handle, "gb")

    @classmethod
    def _fetch_feature_proteins(cls, accession):
        """Returns the proteins annotated on the CDS features of ``accession``."""
        return cls._proteins_from_features(cls._read_record(accession).features)

    @classmethod
    def _fetch_bacteria_proteins(cls, accession):
        """
        Returns the proteins of a bacterial record.

        When the parsed record has five features or fewer, the record is
        downloaded again as ``gbwithparts`` text and the proteins found by the
        text parser are merged in.
        """
        record = cls._read_record(accession)
        proteins = cls._proteins_from_features(record.features)
        if len(record.features) <= 5:
            entrez = cls._entrez()
            with entrez.efetch(db="nucleotide", rettype="gbwithparts", retmode="text", id=accession) as handle:
                lines = handle.readlines()
            proteins.update(cls._proteins_from_flatfile(lines))
        return proteins

    def _save_proteins(self, proteins, name):
        """Writes ``proteins`` to ``files/<name>.json`` and ``files/<name>.fasta``."""
        import json

        with open('files/' + name + '.json', 'w') as f:
            json.dump(proteins, f)
        self.__createFasta(proteins, name)

    # --------------------------------------------------------------- public API

    def addFeatures(self):
        """
        Downloads the proteins of every phage and of every host bacterium.

        Results are stored in ``self.phagesProteins`` and ``self.bactProteins``
        and written to ``files/`` as JSON and FASTA. A failed download raises and
        nothing is written.

        :return:
        """
        print('Working...')
        for phage_id in self.data.index:
            self.phagesProteins[phage_id] = self._fetch_feature_proteins(phage_id)

        for bact in self.all_bact:
            self.bactProteins[bact] = self._fetch_feature_proteins(bact)

        self._save_proteins(self.phagesProteins, 'phagesProteins')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def addBacteriaFeatures(self):
        """
        Downloads the proteins of every host bacterium not yet in ``self.bactProteins``.

        A bacterium whose download or parsing fails is reported and skipped. The
        accumulated dictionary is then written to ``files/bactProteins.json`` and
        ``files/bactProteins.fasta``.

        :return:
        """
        print('Working...')
        for bact in self.all_bact:
            if bact in self.bactProteins:
                continue
            try:
                self.bactProteins[bact] = self._fetch_bacteria_proteins(bact)
            except Exception as exc:
                # Best effort: one bad record must not abort the whole run.
                print(bact + ' failed (' + repr(exc) + ')')
        self._save_proteins(self.bactProteins, 'bactProteins')
        print('Done')

    def add_individual_bacteria(self):
        """
        Downloads each host bacterium into its own ``files/bacteria/<id>.json``.

        Bacteria whose file already exists are skipped, so the method can be
        re-run to resume. Failures are reported and skipped. Nothing is added to
        ``self.bactProteins`` and no combined JSON or FASTA file is written.

        :return:
        """
        import json
        from pathlib import Path

        print('Working...')
        out_dir = Path('files/bacteria')
        # Without the directory every write would fail after the download.
        out_dir.mkdir(parents=True, exist_ok=True)
        for bact in self.all_bact:
            target = out_dir / (bact + '.json')
            if target.is_file():
                continue
            try:
                proteins = self._fetch_bacteria_proteins(bact)
                with open(target, 'w') as f:
                    json.dump(proteins, f)
            except Exception as exc:
                print(bact + ' failed (' + repr(exc) + ')')
        print('Done')

    def importData(self):
        """
        Loads ``files/phagesProteins.json`` into ``self.phagesProteins``.

        Bacterial proteins are not loaded.

        :return:
        """
        import json

        with open('files/phagesProteins.json', encoding='utf-8') as F:
            self.phagesProteins = json.load(F)

    def addPhageProt(self):
        """
        Downloads the proteins of every phage in the dataset.

        :return: ``self.phagesProteins`` after it was written to
            ``files/phagesProteins.json`` and ``files/phagesProteins.fasta``
        """
        print('Working...')

        for phage_id in self.data.index:
            self.phagesProteins[phage_id] = self._fetch_feature_proteins(phage_id)

        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def add_missing_phage(self):
        """
        Downloads the proteins of the phages that are not yet in ``self.phagesProteins``.

        :return: ``self.phagesProteins`` after it was written to
            ``files/phagesProteins.json`` and ``files/phagesProteins.fasta``
        """
        print('Working...')

        fetched = 0
        try:
            for phage_id in self.data.index:
                if phage_id in self.phagesProteins:
                    continue
                print(phage_id)
                self.phagesProteins[phage_id] = self._fetch_feature_proteins(phage_id)
                fetched += 1
        except Exception:
            # Keep what was downloaded before the failure, but never overwrite
            # the cache when this run contributed nothing new.
            if fetched:
                self._save_proteins(self.phagesProteins, 'phagesProteins')
            raise
        self._save_proteins(self.phagesProteins, 'phagesProteins')
        return self.phagesProteins

    def __createFasta(self, var, name):
        """
        Writes ``var`` as FASTA to ``files/<name>.fasta``.

        :param var: dictionary keyed by species/accession
        :param name: file name without extension
        """
        with open('files/' + name + '.fasta', 'w') as F:
            F.write(self._fasta_text(var))


def main():
    """
    Runs the download pipeline on ``NCBI_Phage_Bacteria_Data.csv``.

    Steps, in order: per-bacterium protein files, proteins of all phages,
    proteins of any phage still missing, then a reload of the saved phage
    proteins from disk.
    """
    pipeline = PhageBacteriaInformation('NCBI_Phage_Bacteria_Data.csv')
    pipeline.add_individual_bacteria()
    pipeline.addPhageProt()
    pipeline.add_missing_phage()
    pipeline.importData()


if __name__ == '__main__':
    main()