Mercurial > repos > pedro_araujo > phage_host_prediction
diff process_raw_data.py @ 0:e4b3fc88efe0 draft
Uploaded
author | pedro_araujo |
---|---|
date | Wed, 27 Jan 2021 13:50:11 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/process_raw_data.py Wed Jan 27 13:50:11 2021 +0000 @@ -0,0 +1,386 @@ +""" +https://www.ncbi.nlm.nih.gov/Sequin/acc.html +https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly +""" + + +class PhageBacteriaData: + + def __init__(self, dataset=None): + """ + Imports a dataset from NCBI Virus, where the columns are Phage ID, Phage Name, Bacteria Name, Bacteria ID. + If a phage entry does not have a bacteria associated, it is deleted + :param dataset: + """ + import pandas as pd + import json + self.listBacID = [] + if dataset is None: + file = False + while not file: + try: + name = input('File name: ') + self.data = pd.read_csv('files/' + name, header=0, index_col=0) + file = True + except: + print('Couldn\'t find file') + else: + self.data = pd.read_csv('files/' + dataset, header=0, index_col=0) + self.data = self.data.dropna(how='any') + self.data = self.data[self.data['Host'] != 'unclassified bacterium'] + index_remove = [] + for i in range(len(self.data)): + if 'uncultured' in self.data.iloc[i]['Species']: + index_remove.append(i) + elif 'virus' not in self.data.iloc[i]['Species'] and 'phage' not in self.data.iloc[i]['Species']: + index_remove.append(i) + self.data = self.data.drop(self.data.index[index_remove]) + index_remove = [] + for i in range(len(self.data)): + temp = self.data['Host'][i].split(' ') + if len(temp) <= 1: + index_remove.append(i) + self.data = self.data.drop(self.data.index[index_remove]) + if 'Host_ID' not in self.data.columns: + temp = [] + for i in self.data.index: + temp.append([]) + self.data['Host_ID'] = temp + try: + with open('files/searched_accessions', 'r') as f: + self.searched = json.loads(f.read()) + except: + self.searched = {} + + def addPhageName(self): + """ + Using the entrez service, from NCBI, for each phage the name is added, from its features. + :return: + """ + from Bio import Entrez + from Bio import SeqIO + Entrez.email = "pedro_araujo97@hotmail.com" + listNames = [] + with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=self.data.index) as handle: + for seq_record in SeqIO.parse(handle, "gb"): + listNames.append(seq_record.annotations['organism']) + self.data['Species'] = listNames + + def addBacteriaName(self): + """ + Using the entrez service, from NCBI, for each phage the infecting bacteria name is added, from its features. + :return: + """ + from Bio import Entrez + from Bio import SeqIO + Entrez.email = "pedro_araujo97@hotmail.com" + for phage in self.data.index: + with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=phage) as handle: + seq_record = SeqIO.read(handle, "gb") + try: + if len(seq_record.features[0].qualifiers['host'][0].split(' ')) > 2: + self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['host'][0] + except: + if len(seq_record.features[0].qualifiers['lab_host'][0].split(' ')) > 2: + self.data.loc[phage, 'Host'] = seq_record.features[0].qualifiers['lab_host'][0] + self.save_data() + + def addBacteriaGenome(self): + """ + For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, all associated IDs are extracted. + If the ID corresponds to a bacterial strain, its accession ID is added to the Bacteria ID column. + :return: + """ + from Bio import Entrez + import ast + import pickle + from pathlib import Path + Entrez.email = "pedro_araujo97@hotmail.com" + my_file = Path("files/searched_hosts") + if my_file.is_file(): + with open('files/searched_hosts', 'rb') as f: + list_done = pickle.load(f) + else: + list_done = [] + count = 0 + try: + for phageID in self.data.index: + if phageID in list_done: continue + listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) + try: + with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: + pubmed = Entrez.read(handle) + for i in pubmed[0]['LinkSetDb']: + if 'weighted' not in i["LinkName"]: + for link in i["Link"]: + try: + with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle: + genomes = Entrez.read(handle) + for i in genomes[0]['LinkSetDb']: + if 'weighted' not in i['LinkName']: + for id in i['Link']: + with Entrez.esummary(db='nucleotide', id=id['Id']) as handle: + bacorg = Entrez.read(handle) + if bacorg[0]['Caption'] != phageID and 'phage' not in bacorg[0][ + 'Title'].lower() \ + and bacorg[0][ + 'AccessionVersion'] not in listBactID and 'cds' not in \ + bacorg[0]['Title'].lower() and 'shotgun' not in bacorg[0][ + 'Title'].lower(): + if any(z in bacorg[0]['AccessionVersion'][:3] for z in + ['NC_', 'AC_', 'NZ_', 'CP', 'AE', 'CY', 'AP']): + listBactID.append(bacorg[0]['AccessionVersion']) + self.searched[bacorg[0]['AccessionVersion']] = 'yes' + count += 1 + elif not any(z in bacorg[0]['AccessionVersion'][:3] for z in + ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', + 'NM_', 'KC', 'MH', 'AY', 'FN', 'AY']) \ + and 'complete' in bacorg[0]['Title'].lower(): + if bacorg[0]['AccessionVersion'] in self.searched.keys(): + add = self.searched[bacorg[0]['AccessionVersion']] + else: + add = input('Check ' + bacorg[0][ + 'AccessionVersion'] + '\nDo you wish to add it? (yes/no)') + if 'y' in add.lower(): + listBactID.append(bacorg[0]['AccessionVersion']) + self.searched[bacorg[0]['AccessionVersion']] = 'yes' + count += 1 + else: + self.searched[bacorg[0]['AccessionVersion']] = 'no' + except: + pass + except: + pass + self.data.loc[phageID, 'Host_ID'] = listBactID + list_done.append(phageID) + with open('files/searched_hosts', 'wb') as f: + pickle.dump(list_done, f) + self.save_data() + print(phageID) + except: + print('Bacterial host name missing. Searching from phage id') + pass + print('For future reference,', count, "new bacterial ID's were added.") + self.save_data() + + def checkAbstracts(self): + """ + For each phage, the associated scientific articles in pubmed are looked. From theses pubmed articles, the abstracted is searched for mentions of bacterial strains. + If bacterial strains are found, its accession IDs are added to the Bacteria ID column. + :return: + """ + from Bio import Entrez + import re + import ast + Entrez.email = 'pedro_araujo97@hotmail.com' + count = 0 + for phageID in self.data.index: + if len(self.data.loc[phageID, 'Host'].split()) < 3: + with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: + pubmed = Entrez.read(handle) + listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) + for i in pubmed[0]['LinkSetDb']: + if 'weighted' not in i["LinkName"]: + for link in i["Link"]: + try: + with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", + id=link['Id']) as handle: + article = Entrez.read(handle) + abstract = \ + article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0] + x = re.findall('\w{0,1}[A-Z]{1,5}[0-9]{1,5}[-,:]{0,1}[A-Z]{0,5}[1-9]{0,5}', abstract) + for i in range(len(x)): + x[i] = x[i].strip(',;') + x = list(set(x)) + for i in x: + if 'ORF' in i: + x.remove(i) + for strain in x: + with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[ + phageID, 'Host'] + ' ' + strain + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]', + idtype="acc") as handle: + species = Entrez.read(handle) + strains = species['IdList'] + for i in strains: + if any(z in i for z in ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', + 'AP']) and i not in listBactID: + listBactID.append(i) + self.searched[i] = 'yes' + count += 1 + elif not any(z in i[:3] for z in + ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', + 'MH', 'AY', 'FN', 'AY']) and i not in listBactID: + if i in self.searched.keys(): + add = self.searched[i] + else: + add = input('Check ' + i + '\nDo you wish to add it? (yes/no)') + if 'y' in add.lower(): + listBactID.append(i) + self.searched[i] = 'yes' + count += 1 + else: + self.searched = 'no' + except: + pass + self.data.loc[phageID, 'Host_ID'] = listBactID + print('For future reference,', count, "new bacterial ID's were added.") + self.save_data() + + def saveAbstracts(self): + """ + For each phage, the associated scientific articles in pubmed are looked. + From theses pubmed articles, the abstracted is saved in a dictionary structure, where each phage ID is associated with a list of abstracts. + These abstracts can be used for later processing. + :return: + """ + from Bio import Entrez + import json + Entrez.email = 'pedro_araujo97@hotmail.com' + dicPhageAbstracts = {} + + for phageID in self.data.index: + print(phageID, end='\n') + dicPhageAbstracts[phageID] = [] + with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: + pubmed = Entrez.read(handle) + lista = [] + for i in pubmed[0]['LinkSetDb']: + if 'weighted' not in i["LinkName"]: + for link in i["Link"]: + try: + with Entrez.efetch(db="pubmed", rettype="medline", retmode="xml", id=link['Id']) as handle: + article = Entrez.read(handle) + abstract = \ + article['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0] + dicPhageAbstracts[phageID].append([link['Id'], abstract]) + except: + pass + with open('files/phageAbstracts.json', 'w') as f: + json.dump(dicPhageAbstracts, f) + + def searchBacName(self): + from Bio import Entrez + import ast + Entrez.email = "pedro_araujo97@hotmail.com" + count = 0 + for phageID in self.data.index: + if len(self.data.loc[phageID, 'Host'].split()) > 2: + listBactID = ast.literal_eval(self.data.loc[phageID, 'Host_ID']) + with Entrez.esearch(db='nucleotide', term='((((' + self.data.loc[ + phageID, 'Host'] + ' AND complete sequence) NOT shotgun[Title]) NOT phage[Title]) NOT cds[Title]) NOT gene[Title]', + idtype="acc") as handle: + species = Entrez.read(handle) + strains = species['IdList'] + for j in strains: + if any(z in j[:3] for z in + ['NC_', 'NZ_', 'AC_', 'CP', 'AE', 'CY', 'AP']) and j not in listBactID: + listBactID.append(j) + self.searched[j] = 'yes' + count += 1 + elif not any(z in j[:3] for z in + ['MN', 'FM', 'MQ', 'MR', 'MK', 'AB', 'MF', 'KP', 'NM_', 'KC', 'MH', 'AY', 'FN', + 'AY']) and j not in listBactID: + if j in self.searched.keys(): + add = self.searched[j] + else: + add = input('Check ' + j + '\nDo you wish to add it? (yes/no)') + if 'y' in add.lower(): + listBactID.append(j) + self.searched[j] = 'yes' + count += 1 + else: + self.searched[j] = 'no' + self.data.loc[phageID, 'Host_ID'] = listBactID + print('For future reference,', count, "new bacterial ID's were added.") + self.save_data() + + def createListBacID(self, lower=0, upper=100): + """ + More sequential than previous methods. Maybe include every single one... + :param lower: lower index from the phage list (numeric) + :param upper: upper index from the phage list (numeric) + :return: + """ + from Bio import Entrez + Entrez.email = 'pedro_araujo97@hotmail.com' + for i in range(lower, upper): + phageID = self.data.index[i] + BactID = [] + name = test.data.loc[phageID]['Bacteria Name'] + try: + if name != 'unclassified bacterium' and not name != name: # Verificação de hosts válidos + with Entrez.elink(dbfrom='nuccore', db='pubmed', id=phageID) as handle: + pubmed = Entrez.read(handle) + for link in pubmed[0]["LinkSetDb"][0]["Link"]: + try: + with Entrez.elink(dbfrom='pubmed', db="nucleotide", id=link['Id']) as handle: + genomes = Entrez.read(handle) + for id in genomes[0]['LinkSetDb'][0]['Link']: + with Entrez.esummary(db='nucleotide', id=id['Id']) as handle: + bacorg = Entrez.read(handle) + if 'NC_' in bacorg[0]['AccessionVersion'] or 'NZ_' in bacorg[0]['AccessionVersion']: + if bacorg[0]['Caption'] != phageID: + BactID.append(bacorg[0]['AccessionVersion']) + except: + pass + else: + pass + except: + pass + self.listBacID.append(BactID) + + def check_bacteria(self): + from Bio import Entrez + from Bio import SeqIO + import ast + Entrez.email = "pedro_araujo97@hotmail.com" + all_bact = [] + for i in self.data.index: + for bact in ast.literal_eval(self.data.loc[i, 'Host_ID']): + if bact[:-2] not in all_bact: + all_bact.append(bact[:-2]) + list_remove = [] + for bact in all_bact: + if bact not in list_remove: + try: + with Entrez.efetch(db="nucleotide", rettype="gb", retmode="text", id=bact) as handle: + seq_record = SeqIO.read(handle, "gb") + if not any(i in seq_record.description.lower() for i in ['pneumoniae', 'coli', 'baumannii']) or not any(i in seq_record.description.lower() for i in ['escherichia', 'acinetobacter', 'klebsiella']) \ + or 'phage' in seq_record.description.lower() or 'virus' in seq_record.description.lower(): + list_remove.append(bact) + except: + list_remove.append(bact) + print(list_remove) + for phage in self.data.index: + listBactID = ast.literal_eval(self.data.loc[phage, 'Host_ID']) + for bact in listBactID: + if bact[:-2] in list_remove: + listBactID.remove(bact) + self.data.loc[phage, 'Host_ID'] = listBactID + self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv') + + def save_data(self): + """ + Saves the data in csv format. + :return: + """ + import json + self.data.to_csv('files/NCBI_Phage_Bacteria_Data.csv') + with open('files/searched_accessions', 'w') as f: + json.dump(self.searched, f) + + +if __name__ == '__main__': + test = PhageBacteriaData('NCBI_Phage_Bacteria_Data.csv') # sequences + test.addBacteriaName() + test.addBacteriaGenome() + test.searchBacName() # 2266 bacteria added + test.checkAbstracts() + # test.data = test.data.drop(columns=['Bacteria ID']) + test.searchBacName() + # test.createListBacID(0, 100) + # test.data = test.data.iloc[:, 0:3] + test.check_bacteria() + test.save_data() +# test.extractProtein() +# test.importProtein('Phage')