view process_raw_data.py @ 0:e4b3fc88efe0 draft

Uploaded
author pedro_araujo
date Wed, 27 Jan 2021 13:50:11 +0000
parents
children
line wrap: on
line source

"""
https://www.ncbi.nlm.nih.gov/Sequin/acc.html
https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly
"""


class PhageBacteriaData:

	def __init__(self, dataset=None):
		"""
		Imports a dataset from NCBI Virus, where the columns are Phage ID, Phage Name, Bacteria Name, Bacteria ID.
		If a phage entry does not have a bacteria associated, it is deleted
		:param dataset:
		"""
		import pandas as pd
		import json
		self.listBacID = []
		if dataset is None:
			file = False
			while not file:
				try:
					name = input('File name: ')
					self.data = pd.read_csv('files/' + name, header=0, index_col=0)
					file = True
				except:
					print('Couldn\'t find file')
		else:
			self.data = pd.read_csv('files/' + dataset, header=0, index_col=0)
			self.data = self.data.dropna(how='any')
			self.data = self.data[self.data['Host'] != 'unclassified bacterium']
			index_remove = []
			for i in range(len(self.data)):
				if 'uncultured' in self.data.iloc[i]['Species']:
					index_remove.append(i)
				elif 'virus' not in self.data.iloc[i]['Species'] and 'phage' not in self.data.iloc[i]['Species']:
					index_remove.append(i)
			self.data = self.data.drop(self.data.index[index_remove])
			index_remove = []
			for i in range(len(self.data)):
				temp = self.data['Host'][i].split(' ')
				if len(temp) <= 1:
					index_remove.append(i)
			self.data = self.data.drop(self.data.index[index_remove])
		if 'Host_ID' not in self.data.columns:
			temp = []
			for i in self.data.index:
				temp.append([])
			self.data['Host_ID'] = temp
		try:
			with open('files/searched_accessions', 'r') as f:
				self.searched = json.loads(f.read())
		except:
			self.searched = {}

	def addPhageName(self):
		"""
		Using the entrez service, from NCBI, for each phage the name is added, from its features.
		:return:
		"""
		from Bio import Entrez
		from Bio import SeqIO
		Entrez.email = "pedro_araujo97@hotmail.com"
		listNames = []
		with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=self.data.index) as handle:
			for seq_record in SeqIO.parse(handle, "gb"):
				listNames.append(seq_record.annotations['organism'])
		self.data['Species'] = listNames

	def addBacteriaName(self):
		"""
		Using the entrez service, from NCBI, for each phage the infecting bacteria name is added, from its features.
		:return:
		"""
		from Bio import Entrez
		from Bio import SeqIO
		Entrez.email = "pedro_araujo97@hotmail.com"
		for phage in self.data.index:
			with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle:
				seq_record = SeqIO.read(handle, "gb")
				try:
					if len(seq_record.features[0].qualifiers['host'][0].split(' ')) > 2:
						self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['host'][0]
				except:
					if len(seq_record.features[0].qualifiers['lab_host'][0].split(' ')) > 2:
						self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['lab_host'][0]
		self.save_data()

	def addBacteriaGenome(self):
		"""
		For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, all associated IDs are extracted.
		If the ID corresponds to a bacterial strain, its accession ID is added to the Bacteria ID column.
		:return:
		"""
		from Bio import Entrez
		import ast
		import pickle
		from pathlib import Path
		Entrez.email = "pedro_araujo97@hotmail.com"
		my_file = Path("files/searched_hosts")
		if my_file.is_file():
			with open('files/searched_hosts', 'rb') as f:
				list_done = pickle.load(f)
		else:
			list_done = []
		count = 0
		try:
			for phageID in self.data.index:
				if phageID in list_done: continue
				listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
				try:
					with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
						pubmed = Entrez.read(handle)
					for i in pubmed[0]['LinkSetDb']:
						if 'weighted' not in i["LinkName"]:
							for link in i["Link"]:
								try:
									with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
										genomes = Entrez.read(handle)
									for i in genomes[0]['LinkSetDb']:
										if 'weighted' not in i['LinkName']:
											for id in i['Link']:
												with Entrez.esummary(db='nucleotide', id=id['Id']) as handle:
													bacorg = Entrez.read(handle)
													if bacorg[0]['Caption'] != phageID and 'phage' not in bacorg[0][
														'Title'].lower() \
															and bacorg[0][
														'AccessionVersion'] not in listBactID and 'cds' not in \
															bacorg[0]['Title'].lower() and 'shotgun' not in bacorg[0][
														'Title'].lower():
														if any(z in bacorg[0]['AccessionVersion'][:3] for z in
															   ['NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP']):
															listBactID.append(bacorg[0]['AccessionVersion'])
															self.searched[bacorg[0]['AccessionVersion']] = 'yes'
															count += 1
														elif not any(z in bacorg[0]['AccessionVersion'][:3] for z in
																	 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP',
																	  'NM_', 'KC', 'MH', 'AY', 'FN', 'AY']) \
																and 'complete' in bacorg[0]['Title'].lower():
															if bacorg[0]['AccessionVersion'] in self.searched.keys():
																add = self.searched[bacorg[0]['AccessionVersion']]
															else:
																add = input('Check ' + bacorg[0][
																	'AccessionVersion'] + '\nDo you wish to add it? (yes/no)')
															if 'y' in add.lower():
																listBactID.append(bacorg[0]['AccessionVersion'])
																self.searched[bacorg[0]['AccessionVersion']] = 'yes'
																count += 1
															else:
																self.searched[bacorg[0]['AccessionVersion']] = 'no'
								except:
									pass
				except:
					pass
				self.data.loc[phageID, 'Host_ID'] = listBactID
				list_done.append(phageID)
				with open('files/searched_hosts', 'wb') as f:
					pickle.dump(list_done, f)
				self.save_data()
				print(phageID)
		except:
			print('Bacterial host name missing. Searching from phage id')
			pass
		print('For future reference,', count, "new bacterial ID's were added.")
		self.save_data()

	def checkAbstracts(self):
		"""
		For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, the abstracted is searched for mentions of bacterial strains.
		If bacterial strains are found, its accession IDs are added to the Bacteria ID column.
		:return:
		"""
		from Bio import Entrez
		import re
		import ast
		Entrez.email = 'pedro_araujo97@hotmail.com'
		count = 0
		for phageID in self.data.index:
			if len(self.data.loc[phageID, 'Host'].split()) < 3:
				with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
					pubmed = Entrez.read(handle)
				listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
				for i in pubmed[0]['LinkSetDb']:
					if 'weighted' not in i["LinkName"]:
						for link in i["Link"]:
							try:
								with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml",
												   id=link['Id']) as handle:
									article = Entrez.read(handle)
								abstract = \
								article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
								x = re.findall('\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}', abstract)
								for i in range(len(x)):
									x[i] = x[i].strip(',;')
								x = list(set(x))
								for i in x:
									if 'ORF' in i:
										x.remove(i)
								for strain in x:
									with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[
										phageID, 'Host'] + ' ' + strain + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
														idtype="acc") as handle:
										species = Entrez.read(handle)
										strains = species['IdList']
										for i in strains:
											if any(z in i for z in ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY',
																	'AP']) and i not in listBactID:
												listBactID.append(i)
												self.searched[i] = 'yes'
												count += 1
											elif not any(z in i[:3] for z in
														 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC',
														  'MH', 'AY', 'FN', 'AY']) and i not in listBactID:
												if i in self.searched.keys():
													add = self.searched[i]
												else:
													add = input('Check ' + i + '\nDo you wish to add it? (yes/no)')
												if 'y' in add.lower():
													listBactID.append(i)
													self.searched[i] = 'yes'
													count += 1
												else:
													self.searched = 'no'
							except:
								pass
				self.data.loc[phageID, 'Host_ID'] = listBactID
		print('For future reference,', count, "new bacterial ID's were added.")
		self.save_data()

	def saveAbstracts(self):
		"""
		For each phage, the associated scientific articles in pubmed are looked.
		From theses pubmed articles, the abstracted is saved in a dictionary structure, where each phage ID is associated with a list of abstracts.
		These abstracts can be used for later processing.
		:return:
		"""
		from Bio import Entrez
		import json
		Entrez.email = 'pedro_araujo97@hotmail.com'
		dicPhageAbstracts = {}

		for phageID in self.data.index:
			print(phageID, end='\n')
			dicPhageAbstracts[phageID] = []
			with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
				pubmed = Entrez.read(handle)
			lista = []
			for i in pubmed[0]['LinkSetDb']:
				if 'weighted' not in i["LinkName"]:
					for link in i["Link"]:
						try:
							with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=link['Id']) as handle:
								article = Entrez.read(handle)
							abstract = \
							article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
							dicPhageAbstracts[phageID].append([link['Id'], abstract])
						except:
							pass
		with open('files/phageAbstracts.json', 'w') as f:
			json.dump(dicPhageAbstracts, f)

	def searchBacName(self):
		from Bio import Entrez
		import ast
		Entrez.email = "pedro_araujo97@hotmail.com"
		count = 0
		for phageID in self.data.index:
			if len(self.data.loc[phageID, 'Host'].split()) > 2:
				listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID'])
				with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[
					phageID, 'Host'] + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]',
									idtype="acc") as handle:
					species = Entrez.read(handle)
					strains = species['IdList']
					for j in strains:
						if any(z in j[:3] for z in
							   ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP']) and j not in listBactID:
							listBactID.append(j)
							self.searched[j] = 'yes'
							count += 1
						elif not any(z in j[:3] for z in
									 ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN',
									  'AY']) and j not in listBactID:
							if j in self.searched.keys():
								add = self.searched[j]
							else:
								add = input('Check ' + j + '\nDo you wish to add it? (yes/no)')
							if 'y' in add.lower():
								listBactID.append(j)
								self.searched[j] = 'yes'
								count += 1
							else:
								self.searched[j] = 'no'
				self.data.loc[phageID, 'Host_ID'] = listBactID
		print('For future reference,', count, "new bacterial ID's were added.")
		self.save_data()

	def createListBacID(self, lower=0, upper=100):
		"""
		More sequential than previous methods. Maybe include every single one...
		:param lower: lower index from the phage list (numeric)
		:param upper: upper index from the phage list (numeric)
		:return:
		"""
		from Bio import Entrez
		Entrez.email = 'pedro_araujo97@hotmail.com'
		for i in range(lower, upper):
			phageID = self.data.index[i]
			BactID = []
			name = test.data.loc[phageID]['Bacteria Name']
			try:
				if name != 'unclassified bacterium' and not name != name:  # Verificação de hosts válidos
					with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle:
						pubmed = Entrez.read(handle)
					for link in pubmed[0]["LinkSetDb"][0]["Link"]:
						try:
							with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle:
								genomes = Entrez.read(handle)
							for id in genomes[0]['LinkSetDb'][0]['Link']:
								with Entrez.esummary(db='nucleotide', id=id['Id']) as handle:
									bacorg = Entrez.read(handle)
									if 'NC_' in bacorg[0]['AccessionVersion'] or 'NZ_' in bacorg[0]['AccessionVersion']:
										if bacorg[0]['Caption'] != phageID:
											BactID.append(bacorg[0]['AccessionVersion'])
						except:
							pass
				else:
					pass
			except:
				pass
			self.listBacID.append(BactID)

	def check_bacteria(self):
		from Bio import Entrez
		from Bio import SeqIO
		import ast
		Entrez.email = "pedro_araujo97@hotmail.com"
		all_bact = []
		for i in self.data.index:
			for bact in ast.literal_eval(self.data.loc[i, 'Host_ID']):
				if bact[:-2] not in all_bact:
					all_bact.append(bact[:-2])
		list_remove = []
		for bact in all_bact:
			if bact not in list_remove:
				try:
					with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle:
						seq_record = SeqIO.read(handle, "gb")
					if not any(i in seq_record.description.lower() for i in ['pneumoniae', 'coli', 'baumannii']) or not any(i in seq_record.description.lower() for i in ['escherichia', 'acinetobacter', 'klebsiella']) \
							or 'phage' in seq_record.description.lower() or 'virus' in seq_record.description.lower():
						list_remove.append(bact)
				except:
					list_remove.append(bact)
		print(list_remove)
		for phage in self.data.index:
			listBactID = ast.literal_eval(self.data.loc[phage, 'Host_ID'])
			for bact in listBactID:
				if bact[:-2] in list_remove:
					listBactID.remove(bact)
			self.data.loc[phage, 'Host_ID'] = listBactID
		self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')

	def save_data(self):
		"""
		Saves the data in csv format.
		:return:
		"""
		import json
		self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv')
		with open('files/searched_accessions', 'w') as f:
			json.dump(self.searched, f)


if __name__ == '__main__':
	test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv')  # sequences
	test.addBacteriaName()
	test.addBacteriaGenome()
	test.searchBacName()  # 2266 bacteria added
	test.checkAbstracts()
	# test.data = test.data.drop(columns=['Bacteria ID'])
	test.searchBacName()
	# test.createListBacID(0, 100)
	# test.data = test.data.iloc[:, 0:3]
	test.check_bacteria()
	test.save_data()
# test.extractProtein()
# test.importProtein('Phage')