31
|
1 # -*- coding: utf-8 -*-
|
|
2 """
|
|
3 Compute the composition, transition and distribution descriptors based on the
|
|
4 different properties of AADs.
|
|
5 The AADs with the same properties is marked as the same number. You can get 147
|
|
6 descriptors for a given protein sequence.
|
|
7 References
|
|
8 ----------
|
|
9 .. [1] Inna Dubchak, Ilya Muchink, Stephen R.Holbrook and Sung-Hou Kim.
|
|
10 Prediction of protein folding class using global description of amino
|
|
11 acid sequence. Proc.Natl. Acad.Sci.USA, 1995, 92, 8700-8704.
|
|
12 .. [2] Inna Dubchak, Ilya Muchink, Christopher Mayor, Igor Dralyuk and Sung-Hou
|
|
13 Kim. Recognition of a Protein Fold in the Context of the SCOP
|
|
14 classification. Proteins: Structure, Function and
|
|
15 Genetics, 1999, 35, 401-407.
|
|
16 Authors: Dongsheng Cao and Yizeng Liang.
|
|
17 Date: 2010.11.22
|
|
18 Email: oriental-cds@163.com
|
|
19 """
|
|
20
|
|
21 # Core Library
|
|
22 import copy
|
|
23 import math
|
|
24 from typing import Any, Dict
|
|
25
|
|
# Each table maps a class label ("1", "2" or "3") to the one-letter codes of the
# amino acids in that class. Every one of the 20 standard amino acids must appear
# in exactly one class of each table; otherwise residues are left unencoded and
# the composition fractions no longer sum to one.

# '1' stands for Polar; '2' stands for Neutral; '3' stands for Hydrophobic
_Hydrophobicity = {"1": "RKEDQN", "2": "GASTPHY", "3": "CLVIMFW"}

# '1' stands for (0-2.78); '2' stands for (2.95-4.0); '3' stands for (4.03-8.08)
# Cysteine belongs to the smallest-volume class.
_NormalizedVDWV = {"1": "GASTPDC", "2": "NVEQIL", "3": "MHKFRYW"}

# '1' stands for (4.9-6.2); '2' stands for (8.0-9.2); '3' stands for (10.4-13.0)
_Polarity = {"1": "LIFWCMVY", "2": "PATGS", "3": "HQRKNED"}

# '1' stands for Positive; '2' stands for Neutral; '3' stands for Negative
_Charge = {"1": "KR", "2": "ANCQGHILMFPSTWYV", "3": "DE"}

# '1' stands for Helix; '2' stands for Strand; '3' stands for Coil
_SecondaryStr = {"1": "EALMQKRH", "2": "VIYCWFT", "3": "GNPSD"}

# '1' stands for Buried; '2' stands for Exposed; '3' stands for Intermediate
_SolventAccessibility = {"1": "ALFCGIVW", "2": "RKQEND", "3": "MPSTHY"}

# '1' stands for (0-0.108); '2' stands for (0.128-0.186); '3' stands for (0.219-0.409)
_Polarizability = {"1": "GASDT", "2": "CPNVEQIL", "3": "KMHFRYW"}


# Other amino acid properties can be added here to compute further descriptors
# of a protein sequence; list them in both tuples below.

_AATProperty = (
    _Hydrophobicity,
    _NormalizedVDWV,
    _Polarity,
    _Charge,
    _SecondaryStr,
    _SolventAccessibility,
    _Polarizability,
)

# Key prefixes, in the same order as the tables in _AATProperty.
_AATPropertyName = (
    "_Hydrophobicity",
    "_NormalizedVDWV",
    "_Polarity",
    "_Charge",
    "_SecondaryStr",
    "_SolventAccessibility",
    "_Polarizability",
)
|
|
70
|
|
71
|
|
72 def StringtoNum(ProteinSequence: str, AAProperty: Dict[Any, Any]) -> str:
|
|
73 hardProteinSequence = copy.deepcopy(ProteinSequence)
|
|
74 for k, m in list(AAProperty.items()):
|
|
75 for index in m:
|
|
76 hardProteinSequence = hardProteinSequence.replace(index, k)
|
|
77 TProteinSequence = hardProteinSequence
|
|
78
|
|
79 return TProteinSequence
|
|
80
|
|
81
|
|
def CalculateComposition(
    ProteinSequence: str, AAProperty: Dict[Any, Any], AAPName: str
) -> Dict[Any, Any]:
    """
    Compute the composition descriptors of a sequence for one property.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.
    AAProperty : Dict[Any, Any]
        Maps the class labels "1", "2" and "3" to the residues of each class.
    AAPName : str
        Prefix used to build the keys of the result.

    Returns
    -------
    Dict[Any, Any]
        Keys ``AAPName + "C1"``, ``"C2"`` and ``"C3"``, each mapped to the
        fraction of the encoded sequence carrying that class label, rounded to
        three decimals. All fractions are 0.0 for an empty sequence.
    """
    encoded = StringtoNum(ProteinSequence, AAProperty)
    length = len(encoded)
    result = {}
    for label in ("1", "2", "3"):
        # An empty sequence contains no residue of any class; report 0.0
        # rather than dividing by zero.
        fraction = encoded.count(label) / length if length else 0.0
        result[AAPName + "C" + label] = round(fraction, 3)
    return result
|
|
91
|
|
92
|
|
def CalculateTransition(
    ProteinSequence: str, AAProperty: Dict[Any, Any], AAPName: str
) -> Dict[Any, Any]:
    """
    Compute the transition descriptors of a sequence for one property.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.
    AAProperty : Dict[Any, Any]
        Maps the class labels "1", "2" and "3" to the residues of each class.
    AAPName : str
        Prefix used to build the keys of the result.

    Returns
    -------
    Dict[Any, Any]
        Keys ``AAPName + "T12"``, ``"T13"`` and ``"T23"``, each mapped to the
        number of adjacent residue pairs switching between the two classes (in
        either direction) divided by the number of adjacent pairs, rounded to
        three decimals. All values are 0.0 when the sequence has fewer than
        two residues.
    """
    encoded = StringtoNum(ProteinSequence, AAProperty)
    # A sequence of n residues has n - 1 adjacent pairs.
    pair_count = len(encoded) - 1
    result = {}
    for first, second in (("1", "2"), ("1", "3"), ("2", "3")):
        switches = encoded.count(first + second) + encoded.count(second + first)
        # With fewer than two residues there is no pair, hence no transition;
        # report 0.0 rather than dividing by zero (or by -1).
        frequency = switches / pair_count if pair_count > 0 else 0.0
        result[AAPName + "T" + first + second] = round(frequency, 3)
    return result
|
|
110
|
|
111
|
|
def CalculateDistribution(
    ProteinSequence: str, AAProperty: Dict[Any, Any], AAPName: str
) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors of a sequence for one property.

    For each class label the positions of the first, 25%, 50%, 75% and last
    residue of that class are reported as a percentage of the sequence length.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.
    AAProperty : Dict[Any, Any]
        Maps the class labels "1", "2" and "3" to the residues of each class.
    AAPName : str
        Prefix used to build the keys of the result.

    Returns
    -------
    Dict[Any, Any]
        Keys ``AAPName + "D" + label + marker`` for every label in "1", "2",
        "3" and every marker in "001", "025", "050", "075", "100". Values are
        rounded to three decimals, and are 0 for a class that does not occur
        in the sequence.
    """
    encoded = StringtoNum(ProteinSequence, AAProperty)
    length = len(encoded)
    result: Dict[str, float] = {}
    for label in ("1", "2", "3"):
        # 1-based positions of every residue belonging to this class.
        positions = [
            position
            for position, symbol in enumerate(encoded, start=1)
            if symbol == label
        ]
        prefix = AAPName + "D" + label
        if not positions:
            for marker in ("001", "025", "050", "075", "100"):
                result[prefix + marker] = 0
            continue

        occurrences = len(positions)
        result[prefix + "001"] = round(positions[0] / length * 100, 3)
        for marker, fraction in (("025", 0.25), ("050", 0.5), ("075", 0.75)):
            # For a class with only a few residues floor() yields 0; clamp the
            # rank to the first occurrence so the index never becomes -1,
            # which would silently select the last occurrence instead.
            rank = max(int(math.floor(occurrences * fraction)), 1)
            result[prefix + marker] = round(positions[rank - 1] / length * 100, 3)
        result[prefix + "100"] = round(positions[-1] / length * 100, 3)

    return result
|
|
148
|
|
149
|
|
def CalculateCompositionHydrophobicity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the hydrophobicity classes.

    Delegates to ``CalculateComposition`` with the ``_Hydrophobicity`` table;
    the returned keys are prefixed with ``"_Hydrophobicity"``.
    """
    return CalculateComposition(ProteinSequence, _Hydrophobicity, "_Hydrophobicity")
|
|
152
|
|
153
|
|
def CalculateCompositionNormalizedVDWV(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the normalized VDWV classes.

    Delegates to ``CalculateComposition`` with the ``_NormalizedVDWV`` table;
    the returned keys are prefixed with ``"_NormalizedVDWV"``.
    """
    return CalculateComposition(ProteinSequence, _NormalizedVDWV, "_NormalizedVDWV")
|
|
156
|
|
157
|
|
def CalculateCompositionPolarity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the polarity classes.

    Delegates to ``CalculateComposition`` with the ``_Polarity`` table; the
    returned keys are prefixed with ``"_Polarity"``.
    """
    return CalculateComposition(ProteinSequence, _Polarity, "_Polarity")
|
|
160
|
|
161
|
|
def CalculateCompositionCharge(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the charge classes.

    Delegates to ``CalculateComposition`` with the ``_Charge`` table; the
    returned keys are prefixed with ``"_Charge"``.
    """
    descriptors = CalculateComposition(
        ProteinSequence, AAProperty=_Charge, AAPName="_Charge"
    )
    return descriptors
|
|
164
|
|
165
|
|
def CalculateCompositionSecondaryStr(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the secondary structure classes.

    Delegates to ``CalculateComposition`` with the ``_SecondaryStr`` table; the
    returned keys are prefixed with ``"_SecondaryStr"``.
    """
    descriptors = CalculateComposition(
        ProteinSequence, AAProperty=_SecondaryStr, AAPName="_SecondaryStr"
    )
    return descriptors
|
|
168
|
|
169
|
|
def CalculateCompositionSolventAccessibility(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the solvent accessibility classes.

    Delegates to ``CalculateComposition`` with the ``_SolventAccessibility``
    table; the returned keys are prefixed with ``"_SolventAccessibility"``.
    """
    descriptors = CalculateComposition(
        ProteinSequence,
        AAProperty=_SolventAccessibility,
        AAPName="_SolventAccessibility",
    )
    return descriptors
|
|
174
|
|
175
|
|
def CalculateCompositionPolarizability(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for the polarizability classes.

    Delegates to ``CalculateComposition`` with the ``_Polarizability`` table;
    the returned keys are prefixed with ``"_Polarizability"``.
    """
    descriptors = CalculateComposition(
        ProteinSequence, AAProperty=_Polarizability, AAPName="_Polarizability"
    )
    return descriptors
|
|
178
|
|
179
|
|
def CalculateTransitionHydrophobicity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the hydrophobicity classes.

    Delegates to ``CalculateTransition`` with the ``_Hydrophobicity`` table;
    the returned keys are prefixed with ``"_Hydrophobicity"``.
    """
    return CalculateTransition(
        ProteinSequence, AAProperty=_Hydrophobicity, AAPName="_Hydrophobicity"
    )
|
|
183
|
|
184
|
|
def CalculateTransitionNormalizedVDWV(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the normalized VDWV classes.

    Delegates to ``CalculateTransition`` with the ``_NormalizedVDWV`` table;
    the returned keys are prefixed with ``"_NormalizedVDWV"``.
    """
    return CalculateTransition(
        ProteinSequence, AAProperty=_NormalizedVDWV, AAPName="_NormalizedVDWV"
    )
|
|
188
|
|
189
|
|
def CalculateTransitionPolarity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the polarity classes.

    Delegates to ``CalculateTransition`` with the ``_Polarity`` table; the
    returned keys are prefixed with ``"_Polarity"``.
    """
    return CalculateTransition(
        ProteinSequence, AAProperty=_Polarity, AAPName="_Polarity"
    )
|
|
193
|
|
194
|
|
def CalculateTransitionCharge(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the charge classes.

    Delegates to ``CalculateTransition`` with the ``_Charge`` table; the
    returned keys are prefixed with ``"_Charge"``.
    """
    return CalculateTransition(ProteinSequence, AAProperty=_Charge, AAPName="_Charge")
|
|
198
|
|
199
|
|
def CalculateTransitionSecondaryStr(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the secondary structure classes.

    Delegates to ``CalculateTransition`` with the ``_SecondaryStr`` table; the
    returned keys are prefixed with ``"_SecondaryStr"``.
    """
    return CalculateTransition(
        ProteinSequence, AAProperty=_SecondaryStr, AAPName="_SecondaryStr"
    )
|
|
203
|
|
204
|
|
def CalculateTransitionSolventAccessibility(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the solvent accessibility classes.

    Delegates to ``CalculateTransition`` with the ``_SolventAccessibility``
    table; the returned keys are prefixed with ``"_SolventAccessibility"``.
    """
    return CalculateTransition(
        ProteinSequence,
        AAProperty=_SolventAccessibility,
        AAPName="_SolventAccessibility",
    )
|
|
210
|
|
211
|
|
def CalculateTransitionPolarizability(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for the polarizability classes.

    Delegates to ``CalculateTransition`` with the ``_Polarizability`` table;
    the returned keys are prefixed with ``"_Polarizability"``.
    """
    return CalculateTransition(
        ProteinSequence, AAProperty=_Polarizability, AAPName="_Polarizability"
    )
|
|
215
|
|
216
|
|
def CalculateDistributionHydrophobicity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the hydrophobicity classes.

    Delegates to ``CalculateDistribution`` with the ``_Hydrophobicity`` table;
    the returned keys are prefixed with ``"_Hydrophobicity"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_Hydrophobicity, AAPName="_Hydrophobicity"
    )
|
|
220
|
|
221
|
|
def CalculateDistributionNormalizedVDWV(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the normalized VDWV classes.

    Delegates to ``CalculateDistribution`` with the ``_NormalizedVDWV`` table;
    the returned keys are prefixed with ``"_NormalizedVDWV"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_NormalizedVDWV, AAPName="_NormalizedVDWV"
    )
|
|
225
|
|
226
|
|
def CalculateDistributionPolarity(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the polarity classes.

    Delegates to ``CalculateDistribution`` with the ``_Polarity`` table; the
    returned keys are prefixed with ``"_Polarity"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_Polarity, AAPName="_Polarity"
    )
|
|
230
|
|
231
|
|
def CalculateDistributionCharge(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the charge classes.

    Delegates to ``CalculateDistribution`` with the ``_Charge`` table; the
    returned keys are prefixed with ``"_Charge"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_Charge, AAPName="_Charge"
    )
|
|
235
|
|
236
|
|
def CalculateDistributionSecondaryStr(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the secondary structure classes.

    Delegates to ``CalculateDistribution`` with the ``_SecondaryStr`` table;
    the returned keys are prefixed with ``"_SecondaryStr"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_SecondaryStr, AAPName="_SecondaryStr"
    )
|
|
240
|
|
241
|
|
def CalculateDistributionSolventAccessibility(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the solvent accessibility classes.

    Delegates to ``CalculateDistribution`` with the ``_SolventAccessibility``
    table; the returned keys are prefixed with ``"_SolventAccessibility"``.
    """
    return CalculateDistribution(
        ProteinSequence,
        AAProperty=_SolventAccessibility,
        AAPName="_SolventAccessibility",
    )
|
|
247
|
|
248
|
|
def CalculateDistributionPolarizability(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for the polarizability classes.

    Delegates to ``CalculateDistribution`` with the ``_Polarizability`` table;
    the returned keys are prefixed with ``"_Polarizability"``.
    """
    return CalculateDistribution(
        ProteinSequence, AAProperty=_Polarizability, AAPName="_Polarizability"
    )
|
|
252
|
|
253
|
|
def CalculateC(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the composition descriptors for all seven properties.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.

    Returns
    -------
    Dict[Any, Any]
        The merged results of the seven per-property composition functions.
    """
    # The order only determines the insertion order of the keys.
    calculators = (
        CalculateCompositionPolarizability,
        CalculateCompositionSolventAccessibility,
        CalculateCompositionSecondaryStr,
        CalculateCompositionCharge,
        CalculateCompositionPolarity,
        CalculateCompositionNormalizedVDWV,
        CalculateCompositionHydrophobicity,
    )
    result: Dict[Any, Any] = {}
    for calculate in calculators:
        result.update(calculate(ProteinSequence))
    return result
|
|
264
|
|
265
|
|
def CalculateT(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the transition descriptors for all seven properties.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.

    Returns
    -------
    Dict[Any, Any]
        The merged results of the seven per-property transition functions.
    """
    # The order only determines the insertion order of the keys.
    calculators = (
        CalculateTransitionPolarizability,
        CalculateTransitionSolventAccessibility,
        CalculateTransitionSecondaryStr,
        CalculateTransitionCharge,
        CalculateTransitionPolarity,
        CalculateTransitionNormalizedVDWV,
        CalculateTransitionHydrophobicity,
    )
    result: Dict[Any, Any] = {}
    for calculate in calculators:
        result.update(calculate(ProteinSequence))
    return result
|
|
276
|
|
277
|
|
def CalculateD(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute the distribution descriptors for all seven properties.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.

    Returns
    -------
    Dict[Any, Any]
        The merged results of the seven per-property distribution functions.
    """
    # The order only determines the insertion order of the keys.
    calculators = (
        CalculateDistributionPolarizability,
        CalculateDistributionSolventAccessibility,
        CalculateDistributionSecondaryStr,
        CalculateDistributionCharge,
        CalculateDistributionPolarity,
        CalculateDistributionNormalizedVDWV,
        CalculateDistributionHydrophobicity,
    )
    result: Dict[Any, Any] = {}
    for calculate in calculators:
        result.update(calculate(ProteinSequence))
    return result
|
|
288
|
|
289
|
|
def CalculateCTD(ProteinSequence: str) -> Dict[Any, Any]:
    """
    Compute all composition, transition and distribution descriptors.

    Parameters
    ----------
    ProteinSequence : str
        Protein sequence to describe.

    Returns
    -------
    Dict[Any, Any]
        The merged results of every per-property composition, transition and
        distribution function, inserted in that order.
    """
    # Composition first, then transition, then distribution; within each group
    # the per-property order is kept so the key order of the result is stable.
    calculators = (
        CalculateCompositionPolarizability,
        CalculateCompositionSolventAccessibility,
        CalculateCompositionSecondaryStr,
        CalculateCompositionCharge,
        CalculateCompositionPolarity,
        CalculateCompositionNormalizedVDWV,
        CalculateCompositionHydrophobicity,
        CalculateTransitionPolarizability,
        CalculateTransitionSolventAccessibility,
        CalculateTransitionSecondaryStr,
        CalculateTransitionCharge,
        CalculateTransitionPolarity,
        CalculateTransitionNormalizedVDWV,
        CalculateTransitionHydrophobicity,
        CalculateDistributionPolarizability,
        CalculateDistributionSolventAccessibility,
        CalculateDistributionSecondaryStr,
        CalculateDistributionCharge,
        CalculateDistributionPolarity,
        CalculateDistributionNormalizedVDWV,
        CalculateDistributionHydrophobicity,
    )
    result: Dict[Any, Any] = {}
    for calculate in calculators:
        result.update(calculate(ProteinSequence))
    return result
|
|
314
|
|
315
|
|
316
|
|
317
|
|
318
|
|
319
|
|
320
|
|
321
|
|
322
|
|
323
|
|
324
|