Mercurial > repos > jose_duarte > phagedpo
comparison DPOGALAXY.py @ 32:5a0afb1578ea draft
Uploaded
author | jose_duarte |
---|---|
date | Wed, 15 Feb 2023 09:57:01 +0000 |
parents | ce0de724097a |
children |
comparison
equal
deleted
inserted
replaced
31:3d94608aea7a | 32:5a0afb1578ea |
---|---|
1 import pickle | |
2 from Bio import SeqIO | |
3 import os | |
4 import pandas as pd | |
5 import numpy as np | |
6 from local_ctd import CalculateCTD | |
7 from local_AAComposition import CalculateDipeptideComposition | |
8 import sys | |
9 from Bio.SeqUtils.ProtParam import ProteinAnalysis | |
10 | |
1 | 11 |
2 class PDPOPrediction: | 12 class PDPOPrediction: |
3 def __init__(self, Folder = 'location', mdl='',seq_file = 'fasta_file.fasta',ttable=11): | 13 |
4 import pickle | 14 def __init__(self, folder='location', mdl='', seq_file='fasta_file.fasta', ttable=11): |
5 import pandas as pd | 15 """ |
6 from Bio import SeqIO | 16 Initialize PhageDPO prediction. |
7 import os | 17 :param folder: data path |
8 from pathlib import Path | 18 :param mdl: ml model, in this case ANN or SVM |
19 :param seq_file: fasta file | |
20 :param ttable: Translational table. By default, The Bacterial, Archaeal and Plant Plastid Code Table 11 | |
21 """ | |
22 self.records = [] | |
9 self.data = {} | 23 self.data = {} |
10 self.df_output = None | 24 self.df_output = None |
11 self.seqfile = seq_file | 25 self.seqfile = seq_file |
12 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), Folder)) | 26 self.__location__ = os.path.realpath(os.path.join(os.getcwd(), folder)) |
13 | 27 |
14 with open(os.path.join(self.__location__,mdl), 'rb') as m: | 28 with open(os.path.join(self.__location__, mdl), 'rb') as m: |
15 self.model = pickle.load(m) | 29 self.model = pickle.load(m) |
16 if mdl == 'SVM4311': | 30 if mdl == 'SVM4311': |
17 with open(os.path.join(__location__,'d4311_SCALER'),'rb') as sl: | 31 with open(os.path.join(__location__, 'd4311_SCALER'), 'rb') as sl: |
18 self.scaler = pickle.load(sl) | 32 self.scaler = pickle.load(sl) |
19 self.name = mdl | 33 self.name = mdl |
20 elif mdl == 'ANN7185': | 34 elif mdl == 'ANN7185': |
21 with open(os.path.join(__location__,'d7185_SCALER'),'rb') as sc: | 35 with open(os.path.join(__location__, 'd7185_SCALER'), 'rb') as sc: |
22 self.scaler = pickle.load(sc) | 36 self.scaler = pickle.load(sc) |
23 self.name = mdl | 37 self.name = mdl |
24 | 38 |
25 for seq in SeqIO.parse(os.path.join(self.__location__,self.seqfile), 'fasta'): | 39 for seq in SeqIO.parse(os.path.join(self.__location__, self.seqfile), 'fasta'): |
40 record = [] | |
26 DNA_seq = seq.seq | 41 DNA_seq = seq.seq |
27 AA_seq = DNA_seq.translate(table=ttable) | 42 AA_seq = DNA_seq.translate(table=ttable) |
28 descr_seq = seq.description.replace(' ','') | 43 descr_seq = seq.description.replace(' ', '') |
29 self.data[descr_seq]=[DNA_seq._data,AA_seq._data] | 44 self.data[descr_seq] = [DNA_seq._data, AA_seq._data] |
30 self.df = pd.DataFrame({'ID':list(self.data.keys()), | 45 record.append(seq.description) |
31 'DNAseq':[elem[0] for elem in self.data.values()], | 46 record.append(DNA_seq._data) |
32 'AAseq':[elem[1] for elem in self.data.values()]}) | 47 record.append(AA_seq._data) |
33 self.df = self.df.set_index('ID') | 48 self.records.append(record) |
49 | |
50 columns = ['ID', 'DNAseq', 'AAseq'] | |
51 self.df = pd.DataFrame(self.records, columns=columns) | |
52 #self.df = self.df.set_index('ID') | |
53 self.df.update(self.df.DNAseq[self.df.DNAseq.apply(type) == list].str[0]) | |
54 self.df.update(self.df.AAseq[self.df.AAseq.apply(type) == list].str[0]) | |
34 | 55 |
35 def Datastructure(self): | 56 def Datastructure(self): |
36 import pandas as pd | 57 """ |
37 import pickle | 58 Create dataset with all features |
38 from Bio.SeqUtils.ProtParam import ProteinAnalysis | 59 """ |
39 from local_ctd import CalculateCTD | |
40 from local_AAComposition import CalculateAAComposition, CalculateDipeptideComposition | |
41 | |
42 def count_orf(orf_seq): | 60 def count_orf(orf_seq): |
61 """ | |
62 Function to count open reading frames | |
63 :param orf_seq: sequence to analyze | |
64 :return: dictionary with open reading frames | |
65 """ | |
43 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} | 66 dic = {'DNA-A': 0, 'DNA-C': 0, 'DNA-T': 0, 'DNA-G': 0, 'DNA-GC': 0} |
44 for letter in range(len(orf_seq)): | 67 for letter in range(len(orf_seq)): |
45 for k in range(0, 4): | 68 for k in range(0, 4): |
46 if orf_seq[letter] in list(dic.keys())[k][-1]: | 69 if str(orf_seq[letter]) in list(dic.keys())[k][-1]: |
47 dic[list(dic.keys())[k]] += 1 | 70 dic[list(dic.keys())[k]] += 1 |
48 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( | 71 dic['DNA-GC'] = ((dic['DNA-C'] + dic['DNA-G']) / ( |
49 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 | 72 dic['DNA-A'] + dic['DNA-C'] + dic['DNA-T'] + dic['DNA-G'])) * 100 |
50 return dic | 73 return dic |
51 | 74 |
52 def count_aa(aa_seq): | 75 def count_aa(aa_seq): |
76 """ | |
77 Function to count amino acids | |
78 :param aa_seq: sequence to analyze | |
79 :return: dictionary with amino acid composition | |
80 """ | |
53 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, | 81 dic = {'G': 0, 'A': 0, 'L': 0, 'V': 0, 'I': 0, 'P': 0, 'F': 0, 'S': 0, 'T': 0, 'C': 0, |
54 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} | 82 'Y': 0, 'N': 0, 'Q': 0, 'D': 0, 'E': 0, 'R': 0, 'K': 0, 'H': 0, 'W': 0, 'M': 0} |
55 for letter in range(len(aa_seq)): | 83 for letter in range(len(aa_seq)): |
56 if aa_seq[letter] in dic.keys(): | 84 if aa_seq[letter] in dic.keys(): |
57 dic[aa_seq[letter]] += 1 | 85 dic[aa_seq[letter]] += 1 |
58 return dic | 86 return dic |
59 | 87 |
60 def sec_st_fr(aa_seq): | 88 def sec_st_fr(aa_seq): |
61 from Bio.SeqUtils.ProtParam import ProteinAnalysis | 89 """ |
90 Function to analyze secondary structure. Helix, Turn and Sheet | |
91 :param aa_seq: sequence to analyze | |
92 :return: dictionary with composition of each secondary structure | |
93 """ | |
62 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} | 94 st_dic = {'Helix': 0, 'Turn': 0, 'Sheet': 0} |
63 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() | 95 stu = ProteinAnalysis(aa_seq).secondary_structure_fraction() |
64 st_dic['Helix'] = stu[0] | 96 st_dic['Helix'] = stu[0] |
65 st_dic['Turn'] = stu[1] | 97 st_dic['Turn'] = stu[1] |
66 st_dic['Sheet'] = stu[2] | 98 st_dic['Sheet'] = stu[2] |
94 "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF", | 126 "KS", "KT", "MQ", "MG", "MI", "FA", "FR", "FS", "FY", "PC", "PE", "PG", "PH", "PM", "PF", |
95 "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI", | 127 "PT", "SA", "SD", "SC", "SQ", "SW", "TA", "TC", "TM", "WL", "WV", "YE", "YG", "YH", "YI", |
96 "YL", "YK", "YM", "YS"]} | 128 "YL", "YK", "YM", "YS"]} |
97 | 129 |
98 self.df_output = self.df.copy() | 130 self.df_output = self.df.copy() |
99 self.df_output.drop(['DNAseq','AAseq'],axis=1,inplace=True) | 131 self.df_output.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) |
100 dna_feat = {} | 132 dna_feat = {} |
101 aa_len = {} | 133 aa_len = {} |
102 aroma_dic = {} | 134 aroma_dic = {} |
103 iso_dic = {} | 135 iso_dic = {} |
104 aa_content = {} | 136 aa_content = {} |
105 st_dic_master = {} | 137 st_dic_master = {} |
106 CTD_dic = {} | 138 CTD_dic = {} |
107 dp = {} | 139 dp = {} |
140 self.df1 = self.df[['ID']].copy() | |
141 self.df.drop(['ID'], axis=1, inplace=True) | |
108 for i in range(len(self.df)): | 142 for i in range(len(self.df)): |
109 i_name = self.df.index[i] | 143 i_name = self.df.index[i] |
110 dna_feat[i_name] = count_orf(self.df.iloc[i]['DNAseq']) | 144 dna_feat[i] = count_orf(self.df.iloc[i]['DNAseq']) |
111 aa_len[i_name] = len(self.df.iloc[i]['AAseq']) | 145 aa_len[i] = len(self.df.iloc[i]['AAseq']) |
112 aroma_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() | 146 aroma_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).aromaticity() |
113 iso_dic[i_name] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() | 147 iso_dic[i] = ProteinAnalysis(self.df.iloc[i]['AAseq']).isoelectric_point() |
114 aa_content[i_name] = count_aa(self.df.iloc[i]['AAseq']) | 148 aa_content[i] = count_aa(self.df.iloc[i]['AAseq']) |
115 st_dic_master[i_name] = sec_st_fr(self.df.iloc[i]['AAseq']) | 149 st_dic_master[i] = sec_st_fr(self.df.iloc[i]['AAseq']) |
116 CTD_dic[i_name] = CalculateCTD(self.df.iloc[i]['AAseq']) | 150 CTD_dic[i] = CalculateCTD(self.df.iloc[i]['AAseq']) |
117 dp[i_name] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) | 151 dp[i] = CalculateDipeptideComposition(self.df.iloc[i]['AAseq']) |
118 for j in self.df.index: | 152 for j in self.df.index: |
119 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values | 153 self.df.loc[j, dna_feat[j].keys()] = dna_feat[j].values() #dic with multiple values |
120 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value | 154 self.df.loc[j, 'AA_Len'] = int(aa_len[j]) #dic with one value |
121 self.df.loc[j, 'Aromaticity'] = aroma_dic[j] | 155 self.df.loc[j, 'Aromaticity'] = aroma_dic[j] |
122 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] | 156 self.df.loc[j, 'IsoelectricPoint'] = iso_dic[j] |
123 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() | 157 self.df.loc[j, aa_content[j].keys()] = aa_content[j].values() |
124 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() | 158 self.df.loc[j, st_dic_master[j].keys()] = st_dic_master[j].values() |
125 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() | 159 self.df.loc[j, CTD_dic[j].keys()] = CTD_dic[j].values() |
126 self.df.loc[j, dp[j].keys()] = dp[j].values() | 160 self.df.loc[j, dp[j].keys()] = dp[j].values() |
127 self.df.drop(['DNAseq','AAseq'],axis=1,inplace=True) | 161 self.df.drop(['DNAseq', 'AAseq'], axis=1, inplace=True) |
128 | 162 |
129 def Prediction(self): | 163 def Prediction(self): |
130 import os | 164 """ |
131 import pickle | 165 Predicts the percentage of each CDS being depolymerase. |
132 import json | 166 :return: None |
133 import pandas as pd | 167 """ |
134 import numpy as np | 168 ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index, columns=self.df.columns) |
135 from pathlib import Path | |
136 ft_scaler = pd.DataFrame(self.scaler.transform(self.df.iloc[:, :]), index=self.df.index,columns=self.df.columns) | |
137 ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) | 169 ft_scaler = ft_scaler.drop(columns=[col for col in self.df if col not in self.feat[self.name]], axis=1) |
138 scores = self.model.predict_proba(ft_scaler) | 170 scores = self.model.predict_proba(ft_scaler) |
139 pos_scores = np.empty((self.df.shape[0], 0), float) | 171 pos_scores = np.empty((self.df.shape[0], 0), float) |
140 for x in scores: | 172 for x in scores: |
141 pos_scores = np.append(pos_scores, round(x[1]*100)) | 173 pos_scores = np.append(pos_scores, round(x[1]*100)) |
142 self.df_output.reset_index(inplace=True) | 174 self.df_output.reset_index(inplace=True) |
143 self.df_output['{} DPO Prediction (%)'.format(self.name)]= pos_scores | 175 print(self.df_output.columns) |
176 self.df_output.rename(columns={'index': 'CDS'}, inplace=True) | |
177 self.df_output['CDS'] += 1 | |
178 self.df_output['{} DPO Prediction (%)'.format(self.name)] = pos_scores | |
179 print(self.df_output) | |
144 #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) | 180 #self.df_output = self.df_output.sort_values(by='{} DPO Prediction (%)'.format(self.name), ascending=False) |
145 self.df_output.to_html('output.html', index=False, justify='center') | 181 self.df_output.to_html('output.html', index=False, justify='center') |
146 | 182 |
183 | |
147 if __name__ == '__main__': | 184 if __name__ == '__main__': |
148 import os | |
149 import sys | |
150 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) | 185 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) |
151 | 186 |
152 model = sys.argv[1] | 187 model = sys.argv[1] |
153 fasta_file = sys.argv[2] | 188 fasta_file = sys.argv[2] |
154 | 189 |
155 PDPO = PDPOPrediction(__location__,model,fasta_file) | 190 PDPO = PDPOPrediction(__location__, model, fasta_file) |
156 PDPO.Datastructure() | 191 PDPO.Datastructure() |
157 PDPO.Prediction() | 192 PDPO.Prediction() |
158 | 193 |