Mercurial > repos > jose_duarte > phagedpo
changeset 34:a20338e6e58f draft
Deleted selected files
author | jose_duarte |
---|---|
date | Tue, 13 Jun 2023 09:53:20 +0000 |
parents | 269e43aa8721 |
children | a662eb3f87c2 |
files | ANN7185 DPOGALAXY.py SVM4311 d4311_SCALER d7185_SCALER |
diffstat | 5 files changed, 0 insertions(+), 193 deletions(-) [+] |
line wrap: on
line diff
--- a/DPOGALAXY.py Tue Jun 13 09:53:02 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,193 +0,0 @@ -import pickle -from Bio import SeqIO -import os -import pandas as pd -import numpy as np -from local_ctd import CalculateCTD -from local_AAComposition import CalculateDipeptideComposition -import sys -from Bio.SeqUtils.ProtParam import ProteinAnalysis - - -class PDPOPrediction: - - def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11): - """ - Initialize PhageDPO prediction. - :param folder: data path - :param mdl: ml model, in this case ANN or SVM - :param seq_file: fasta file - :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11 - """ - self.records = [] - self.data = {} - self.df_output = None - self.seqfile = seq_file - self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder)) - - with open(os.path.join(self.__location__, mdl), 'rb') as m: - self.model = pickle.load(m) - if mdl == 'SVM4311': - with open(os.path.join(__location__, 'd4311_SCALER'), 'rb') as sl: - self.scaler = pickle.load(sl) - self.name = mdl - elif mdl == 'ANN7185': - with open(os.path.join(__location__, 'd7185_SCALER'), 'rb') as sc: - self.scaler = pickle.load(sc) - self.name = mdl - - for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'): - record = [] - DNA_seq = seq.seq - AA_seq = DNA_seq.translate(table=ttable) - descr_seq = seq.description.replace(' ', '') - self.data[descr_seq] = [DNA_seq._data, AA_seq._data] - record.append(seq.description) - record.append(DNA_seq._data) - record.append(AA_seq._data) - self.records.append(record) - - columns = ['ID', 'DNAseq', 'AAseq'] - self.df = pd.DataFrame(self.records, columns=columns) - #self.df = self.df.set_index('ID') - self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0]) - self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0]) - - def Datastructure(self): - """ - Create dataset with all features - """ - def count_orf(orf_seq): - """ - Function to count open reading frames - :param orf_seq: sequence to analyze - :return: dictionary with open reading frames - """ - dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} - for letter in range(len(orf_seq)): - for k in range(0, 4): - if str(orf_seq[letter]) in list(dic.keys())[k][-1]: - dic[list(dic.keys())[k]] += 1 - dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( - dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 - return dic - - def count_aa(aa_seq): - """ - Function to count amino acids - :param aa_seq: sequence to analyze - :return: dictionary with amino acid composition - """ - dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, - 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} - for letter in range(len(aa_seq)): - if aa_seq[letter] in dic.keys(): - dic[aa_seq[letter]] += 1 - return dic - - def sec_st_fr(aa_seq): - """ - Function to analyze secondary structure. Helix, Turn and Sheet - :param aa_seq: sequence to analyze - :return: dictionary with composition of each secondary structure - """ - st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} - stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() - st_dic['Helix'] = stu[0] - st_dic['Turn'] = stu[1] - st_dic['Sheet'] = stu[2] - return st_dic - - self.feat={"SVM4311": ["DNA-A", "DNA-T", "DNA-G", "DNA-GC", "AA_Len", "G", "A", "S", "T", "N", "Turn", "Sheet", - "_PolarizabilityC1", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SecondaryStrC1", - "_SecondaryStrC2", "_SecondaryStrC3", "_ChargeC2", "_ChargeC3", "_PolarityC1", "_NormalizedVDWVC1", - "_NormalizedVDWVC3", "_HydrophobicityC2", "_HydrophobicityC3", "_SecondaryStrT23", - "_NormalizedVDWVT13", "_PolarizabilityD1001", "_SolventAccessibilityD1001", "_SolventAccessibilityD2001", - "_SolventAccessibilityD3001", "_SecondaryStrD1025", "_ChargeD1075","_ChargeD2001", "_ChargeD2025", - "_ChargeD3025", "_ChargeD3050", "_PolarityD1075", "_PolarityD3025","_NormalizedVDWVD1001", - "_NormalizedVDWVD3050", "_HydrophobicityD2001", "DG", "DT", "GD"], - "ANN7185": ["DNA-GC", "AA_Len", "Aromaticity", "IsoelectricPoint", "G", "A", "L", "V", "I", "P", "F", - "S", "T", "C", "Y", "N", "Q", "D", "E", "R", "K", "H", "W", "M", "Turn", "Sheet", "_PolarizabilityC1", - "_PolarizabilityC2", "_PolarizabilityC3", "_SolventAccessibilityC1", "_SolventAccessibilityC2", - "_SecondaryStrC1", "_SecondaryStrC3", "_ChargeC1", "_ChargeC2", "_ChargeC3", "_PolarityC2", - "_NormalizedVDWVC2", "_NormalizedVDWVC3", "_HydrophobicityC1", "_HydrophobicityC2", "_SecondaryStrT13", - "_SecondaryStrT23", "_ChargeT12", "_ChargeT13", "_HydrophobicityT12", "_PolarizabilityD1001", - "_PolarizabilityD1025", "_PolarizabilityD1050", "_PolarizabilityD2001", "_PolarizabilityD3025", - "_PolarizabilityD3050", "_PolarizabilityD3075", "_SolventAccessibilityD1050", "_SolventAccessibilityD2001", - "_SolventAccessibilityD2025", "_SolventAccessibilityD2050", "_SolventAccessibilityD3025", - "_SolventAccessibilityD3050", "_SolventAccessibilityD3100", "_SecondaryStrD1025", "_SecondaryStrD1050", - "_SecondaryStrD1075", "_SecondaryStrD2001", "_SecondaryStrD2050", "_SecondaryStrD2075", "_ChargeD1050", - "_ChargeD1075", "_ChargeD1100", "_ChargeD2025", "_ChargeD3025", "_ChargeD3050", "_PolarityD2050", - "_PolarityD3050", "_NormalizedVDWVD1001", "_NormalizedVDWVD1050", "_NormalizedVDWVD2001", "_NormalizedVDWVD2025", - "_HydrophobicityD3001", "_HydrophobicityD3075", "AD", "AW", "AY", "RC", "RT", "NA", "NE", - "NG", "NP", "DE", "DQ", "DG", "DT", "DY", "CG", "CL", "CY", "CV", "EN", "QA", "QR", "QE", - "QI", "GA", "GR", "GD", "GQ", "GG", "GH", "GL", "GF", "GP", "GT", "GY", "HA", "HC", "HI", - "HK", "HP", "IC", "IG", "IS", "IT", "IW", "LA", "LR", "LH", "LI", "LK", "LP", "KQ", "KH", - "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF", - "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI", - "YL", "YK", "YM", "YS"]} - - self.df_output = self.df.copy() - self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) - dna_feat = {} - aa_len = {} - aroma_dic = {} - iso_dic = {} - aa_content = {} - st_dic_master = {} - CTD_dic = {} - dp = {} - self.df1 = self.df[['ID']].copy() - self.df.drop(['ID'], axis=1, inplace=True) - for i in range(len(self.df)): - i_name = self.df.index[i] - dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq']) - aa_len[i] = len(self.df.iloc[i]['AAseq']) - aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() - iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() - aa_content[i] = count_aa(self.df.iloc[i]['AAseq']) - st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq']) - CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq']) - dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) - for j in self.df.index: - self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values - self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value - self.df.loc[j, 'Aromaticity'] = aroma_dic[j] - self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] - self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() - self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() - self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() - self.df.loc[j, dp[j].keys()] = dp[j].values() - self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) - - def Prediction(self): - """ - Predicts the percentage of each CDS being depolymerase. - :return: None - """ - ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index, columns=self.df.columns) - ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) - scores = self.model.predict_proba(ft_scaler) - pos_scores = np.empty((self.df.shape[0], 0), float) - for x in scores: - pos_scores = np.append(pos_scores, round(x[1]*100)) - self.df_output.reset_index(inplace=True) - print(self.df_output.columns) - self.df_output.rename(columns={'index': 'CDS'}, inplace=True) - self.df_output['CDS'] += 1 - self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores - print(self.df_output) - #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) - self.df_output.to_html('output.html', index=False, justify='center') - - -if __name__ == '__main__': - __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - - model = sys.argv[1] - fasta_file = sys.argv[2] - - PDPO = PDPOPrediction(__location__, model, fasta_file) - PDPO.Datastructure() - PDPO.Prediction() -