comparison docker/alphafold/alphafold/common/residue_constants.py @ 1:6c92e000d684 draft

"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86"
author galaxy-australia
date Tue, 01 Mar 2022 02:53:05 +0000
parents
children
comparison
equal deleted inserted replaced
0:7ae9d78b06f5 1:6c92e000d684
1 # Copyright 2021 DeepMind Technologies Limited
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 """Constants used in AlphaFold."""
16
17 import collections
18 import functools
19 import os
20 from typing import List, Mapping, Tuple
21
22 import numpy as np
23 import tree
24
25 # Internal import (35fd).
26
27
28 # Distance from one CA to next CA [trans configuration: omega = 180].
29 ca_ca = 3.80209737096
30
31 # Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
32 # this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
33 # chi angles so their chi angle lists are empty.
34 chi_angles_atoms = {
35 'ALA': [],
36 # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
37 'ARG': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
38 ['CB', 'CG', 'CD', 'NE'], ['CG', 'CD', 'NE', 'CZ']],
39 'ASN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
40 'ASP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'OD1']],
41 'CYS': [['N', 'CA', 'CB', 'SG']],
42 'GLN': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
43 ['CB', 'CG', 'CD', 'OE1']],
44 'GLU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
45 ['CB', 'CG', 'CD', 'OE1']],
46 'GLY': [],
47 'HIS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'ND1']],
48 'ILE': [['N', 'CA', 'CB', 'CG1'], ['CA', 'CB', 'CG1', 'CD1']],
49 'LEU': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
50 'LYS': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD'],
51 ['CB', 'CG', 'CD', 'CE'], ['CG', 'CD', 'CE', 'NZ']],
52 'MET': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'SD'],
53 ['CB', 'CG', 'SD', 'CE']],
54 'PHE': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
55 'PRO': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD']],
56 'SER': [['N', 'CA', 'CB', 'OG']],
57 'THR': [['N', 'CA', 'CB', 'OG1']],
58 'TRP': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
59 'TYR': [['N', 'CA', 'CB', 'CG'], ['CA', 'CB', 'CG', 'CD1']],
60 'VAL': [['N', 'CA', 'CB', 'CG1']],
61 }
62
63 # If chi angles given in fixed-length array, this matrix determines how to mask
64 # them for each AA type. The order is as per restype_order (see below).
65 chi_angles_mask = [
66 [0.0, 0.0, 0.0, 0.0], # ALA
67 [1.0, 1.0, 1.0, 1.0], # ARG
68 [1.0, 1.0, 0.0, 0.0], # ASN
69 [1.0, 1.0, 0.0, 0.0], # ASP
70 [1.0, 0.0, 0.0, 0.0], # CYS
71 [1.0, 1.0, 1.0, 0.0], # GLN
72 [1.0, 1.0, 1.0, 0.0], # GLU
73 [0.0, 0.0, 0.0, 0.0], # GLY
74 [1.0, 1.0, 0.0, 0.0], # HIS
75 [1.0, 1.0, 0.0, 0.0], # ILE
76 [1.0, 1.0, 0.0, 0.0], # LEU
77 [1.0, 1.0, 1.0, 1.0], # LYS
78 [1.0, 1.0, 1.0, 0.0], # MET
79 [1.0, 1.0, 0.0, 0.0], # PHE
80 [1.0, 1.0, 0.0, 0.0], # PRO
81 [1.0, 0.0, 0.0, 0.0], # SER
82 [1.0, 0.0, 0.0, 0.0], # THR
83 [1.0, 1.0, 0.0, 0.0], # TRP
84 [1.0, 1.0, 0.0, 0.0], # TYR
85 [1.0, 0.0, 0.0, 0.0], # VAL
86 ]
87
88 # The following chi angles are pi periodic: they can be rotated by a multiple
89 # of pi without affecting the structure.
90 chi_pi_periodic = [
91 [0.0, 0.0, 0.0, 0.0], # ALA
92 [0.0, 0.0, 0.0, 0.0], # ARG
93 [0.0, 0.0, 0.0, 0.0], # ASN
94 [0.0, 1.0, 0.0, 0.0], # ASP
95 [0.0, 0.0, 0.0, 0.0], # CYS
96 [0.0, 0.0, 0.0, 0.0], # GLN
97 [0.0, 0.0, 1.0, 0.0], # GLU
98 [0.0, 0.0, 0.0, 0.0], # GLY
99 [0.0, 0.0, 0.0, 0.0], # HIS
100 [0.0, 0.0, 0.0, 0.0], # ILE
101 [0.0, 0.0, 0.0, 0.0], # LEU
102 [0.0, 0.0, 0.0, 0.0], # LYS
103 [0.0, 0.0, 0.0, 0.0], # MET
104 [0.0, 1.0, 0.0, 0.0], # PHE
105 [0.0, 0.0, 0.0, 0.0], # PRO
106 [0.0, 0.0, 0.0, 0.0], # SER
107 [0.0, 0.0, 0.0, 0.0], # THR
108 [0.0, 0.0, 0.0, 0.0], # TRP
109 [0.0, 1.0, 0.0, 0.0], # TYR
110 [0.0, 0.0, 0.0, 0.0], # VAL
111 [0.0, 0.0, 0.0, 0.0], # UNK
112 ]
113
114 # Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
115 # psi and chi angles:
116 # 0: 'backbone group',
117 # 1: 'pre-omega-group', (empty)
118 # 2: 'phi-group', (currently empty, because it defines only hydrogens)
119 # 3: 'psi-group',
120 # 4,5,6,7: 'chi1,2,3,4-group'
121 # The atom positions are relative to the axis-end-atom of the corresponding
122 # rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
123 # is defined such that the dihedral-angle-definiting atom (the last entry in
124 # chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
125 # format: [atomname, group_idx, rel_position]
126 rigid_group_atom_positions = {
127 'ALA': [
128 ['N', 0, (-0.525, 1.363, 0.000)],
129 ['CA', 0, (0.000, 0.000, 0.000)],
130 ['C', 0, (1.526, -0.000, -0.000)],
131 ['CB', 0, (-0.529, -0.774, -1.205)],
132 ['O', 3, (0.627, 1.062, 0.000)],
133 ],
134 'ARG': [
135 ['N', 0, (-0.524, 1.362, -0.000)],
136 ['CA', 0, (0.000, 0.000, 0.000)],
137 ['C', 0, (1.525, -0.000, -0.000)],
138 ['CB', 0, (-0.524, -0.778, -1.209)],
139 ['O', 3, (0.626, 1.062, 0.000)],
140 ['CG', 4, (0.616, 1.390, -0.000)],
141 ['CD', 5, (0.564, 1.414, 0.000)],
142 ['NE', 6, (0.539, 1.357, -0.000)],
143 ['NH1', 7, (0.206, 2.301, 0.000)],
144 ['NH2', 7, (2.078, 0.978, -0.000)],
145 ['CZ', 7, (0.758, 1.093, -0.000)],
146 ],
147 'ASN': [
148 ['N', 0, (-0.536, 1.357, 0.000)],
149 ['CA', 0, (0.000, 0.000, 0.000)],
150 ['C', 0, (1.526, -0.000, -0.000)],
151 ['CB', 0, (-0.531, -0.787, -1.200)],
152 ['O', 3, (0.625, 1.062, 0.000)],
153 ['CG', 4, (0.584, 1.399, 0.000)],
154 ['ND2', 5, (0.593, -1.188, 0.001)],
155 ['OD1', 5, (0.633, 1.059, 0.000)],
156 ],
157 'ASP': [
158 ['N', 0, (-0.525, 1.362, -0.000)],
159 ['CA', 0, (0.000, 0.000, 0.000)],
160 ['C', 0, (1.527, 0.000, -0.000)],
161 ['CB', 0, (-0.526, -0.778, -1.208)],
162 ['O', 3, (0.626, 1.062, -0.000)],
163 ['CG', 4, (0.593, 1.398, -0.000)],
164 ['OD1', 5, (0.610, 1.091, 0.000)],
165 ['OD2', 5, (0.592, -1.101, -0.003)],
166 ],
167 'CYS': [
168 ['N', 0, (-0.522, 1.362, -0.000)],
169 ['CA', 0, (0.000, 0.000, 0.000)],
170 ['C', 0, (1.524, 0.000, 0.000)],
171 ['CB', 0, (-0.519, -0.773, -1.212)],
172 ['O', 3, (0.625, 1.062, -0.000)],
173 ['SG', 4, (0.728, 1.653, 0.000)],
174 ],
175 'GLN': [
176 ['N', 0, (-0.526, 1.361, -0.000)],
177 ['CA', 0, (0.000, 0.000, 0.000)],
178 ['C', 0, (1.526, 0.000, 0.000)],
179 ['CB', 0, (-0.525, -0.779, -1.207)],
180 ['O', 3, (0.626, 1.062, -0.000)],
181 ['CG', 4, (0.615, 1.393, 0.000)],
182 ['CD', 5, (0.587, 1.399, -0.000)],
183 ['NE2', 6, (0.593, -1.189, -0.001)],
184 ['OE1', 6, (0.634, 1.060, 0.000)],
185 ],
186 'GLU': [
187 ['N', 0, (-0.528, 1.361, 0.000)],
188 ['CA', 0, (0.000, 0.000, 0.000)],
189 ['C', 0, (1.526, -0.000, -0.000)],
190 ['CB', 0, (-0.526, -0.781, -1.207)],
191 ['O', 3, (0.626, 1.062, 0.000)],
192 ['CG', 4, (0.615, 1.392, 0.000)],
193 ['CD', 5, (0.600, 1.397, 0.000)],
194 ['OE1', 6, (0.607, 1.095, -0.000)],
195 ['OE2', 6, (0.589, -1.104, -0.001)],
196 ],
197 'GLY': [
198 ['N', 0, (-0.572, 1.337, 0.000)],
199 ['CA', 0, (0.000, 0.000, 0.000)],
200 ['C', 0, (1.517, -0.000, -0.000)],
201 ['O', 3, (0.626, 1.062, -0.000)],
202 ],
203 'HIS': [
204 ['N', 0, (-0.527, 1.360, 0.000)],
205 ['CA', 0, (0.000, 0.000, 0.000)],
206 ['C', 0, (1.525, 0.000, 0.000)],
207 ['CB', 0, (-0.525, -0.778, -1.208)],
208 ['O', 3, (0.625, 1.063, 0.000)],
209 ['CG', 4, (0.600, 1.370, -0.000)],
210 ['CD2', 5, (0.889, -1.021, 0.003)],
211 ['ND1', 5, (0.744, 1.160, -0.000)],
212 ['CE1', 5, (2.030, 0.851, 0.002)],
213 ['NE2', 5, (2.145, -0.466, 0.004)],
214 ],
215 'ILE': [
216 ['N', 0, (-0.493, 1.373, -0.000)],
217 ['CA', 0, (0.000, 0.000, 0.000)],
218 ['C', 0, (1.527, -0.000, -0.000)],
219 ['CB', 0, (-0.536, -0.793, -1.213)],
220 ['O', 3, (0.627, 1.062, -0.000)],
221 ['CG1', 4, (0.534, 1.437, -0.000)],
222 ['CG2', 4, (0.540, -0.785, -1.199)],
223 ['CD1', 5, (0.619, 1.391, 0.000)],
224 ],
225 'LEU': [
226 ['N', 0, (-0.520, 1.363, 0.000)],
227 ['CA', 0, (0.000, 0.000, 0.000)],
228 ['C', 0, (1.525, -0.000, -0.000)],
229 ['CB', 0, (-0.522, -0.773, -1.214)],
230 ['O', 3, (0.625, 1.063, -0.000)],
231 ['CG', 4, (0.678, 1.371, 0.000)],
232 ['CD1', 5, (0.530, 1.430, -0.000)],
233 ['CD2', 5, (0.535, -0.774, 1.200)],
234 ],
235 'LYS': [
236 ['N', 0, (-0.526, 1.362, -0.000)],
237 ['CA', 0, (0.000, 0.000, 0.000)],
238 ['C', 0, (1.526, 0.000, 0.000)],
239 ['CB', 0, (-0.524, -0.778, -1.208)],
240 ['O', 3, (0.626, 1.062, -0.000)],
241 ['CG', 4, (0.619, 1.390, 0.000)],
242 ['CD', 5, (0.559, 1.417, 0.000)],
243 ['CE', 6, (0.560, 1.416, 0.000)],
244 ['NZ', 7, (0.554, 1.387, 0.000)],
245 ],
246 'MET': [
247 ['N', 0, (-0.521, 1.364, -0.000)],
248 ['CA', 0, (0.000, 0.000, 0.000)],
249 ['C', 0, (1.525, 0.000, 0.000)],
250 ['CB', 0, (-0.523, -0.776, -1.210)],
251 ['O', 3, (0.625, 1.062, -0.000)],
252 ['CG', 4, (0.613, 1.391, -0.000)],
253 ['SD', 5, (0.703, 1.695, 0.000)],
254 ['CE', 6, (0.320, 1.786, -0.000)],
255 ],
256 'PHE': [
257 ['N', 0, (-0.518, 1.363, 0.000)],
258 ['CA', 0, (0.000, 0.000, 0.000)],
259 ['C', 0, (1.524, 0.000, -0.000)],
260 ['CB', 0, (-0.525, -0.776, -1.212)],
261 ['O', 3, (0.626, 1.062, -0.000)],
262 ['CG', 4, (0.607, 1.377, 0.000)],
263 ['CD1', 5, (0.709, 1.195, -0.000)],
264 ['CD2', 5, (0.706, -1.196, 0.000)],
265 ['CE1', 5, (2.102, 1.198, -0.000)],
266 ['CE2', 5, (2.098, -1.201, -0.000)],
267 ['CZ', 5, (2.794, -0.003, -0.001)],
268 ],
269 'PRO': [
270 ['N', 0, (-0.566, 1.351, -0.000)],
271 ['CA', 0, (0.000, 0.000, 0.000)],
272 ['C', 0, (1.527, -0.000, 0.000)],
273 ['CB', 0, (-0.546, -0.611, -1.293)],
274 ['O', 3, (0.621, 1.066, 0.000)],
275 ['CG', 4, (0.382, 1.445, 0.0)],
276 # ['CD', 5, (0.427, 1.440, 0.0)],
277 ['CD', 5, (0.477, 1.424, 0.0)], # manually made angle 2 degrees larger
278 ],
279 'SER': [
280 ['N', 0, (-0.529, 1.360, -0.000)],
281 ['CA', 0, (0.000, 0.000, 0.000)],
282 ['C', 0, (1.525, -0.000, -0.000)],
283 ['CB', 0, (-0.518, -0.777, -1.211)],
284 ['O', 3, (0.626, 1.062, -0.000)],
285 ['OG', 4, (0.503, 1.325, 0.000)],
286 ],
287 'THR': [
288 ['N', 0, (-0.517, 1.364, 0.000)],
289 ['CA', 0, (0.000, 0.000, 0.000)],
290 ['C', 0, (1.526, 0.000, -0.000)],
291 ['CB', 0, (-0.516, -0.793, -1.215)],
292 ['O', 3, (0.626, 1.062, 0.000)],
293 ['CG2', 4, (0.550, -0.718, -1.228)],
294 ['OG1', 4, (0.472, 1.353, 0.000)],
295 ],
296 'TRP': [
297 ['N', 0, (-0.521, 1.363, 0.000)],
298 ['CA', 0, (0.000, 0.000, 0.000)],
299 ['C', 0, (1.525, -0.000, 0.000)],
300 ['CB', 0, (-0.523, -0.776, -1.212)],
301 ['O', 3, (0.627, 1.062, 0.000)],
302 ['CG', 4, (0.609, 1.370, -0.000)],
303 ['CD1', 5, (0.824, 1.091, 0.000)],
304 ['CD2', 5, (0.854, -1.148, -0.005)],
305 ['CE2', 5, (2.186, -0.678, -0.007)],
306 ['CE3', 5, (0.622, -2.530, -0.007)],
307 ['NE1', 5, (2.140, 0.690, -0.004)],
308 ['CH2', 5, (3.028, -2.890, -0.013)],
309 ['CZ2', 5, (3.283, -1.543, -0.011)],
310 ['CZ3', 5, (1.715, -3.389, -0.011)],
311 ],
312 'TYR': [
313 ['N', 0, (-0.522, 1.362, 0.000)],
314 ['CA', 0, (0.000, 0.000, 0.000)],
315 ['C', 0, (1.524, -0.000, -0.000)],
316 ['CB', 0, (-0.522, -0.776, -1.213)],
317 ['O', 3, (0.627, 1.062, -0.000)],
318 ['CG', 4, (0.607, 1.382, -0.000)],
319 ['CD1', 5, (0.716, 1.195, -0.000)],
320 ['CD2', 5, (0.713, -1.194, -0.001)],
321 ['CE1', 5, (2.107, 1.200, -0.002)],
322 ['CE2', 5, (2.104, -1.201, -0.003)],
323 ['OH', 5, (4.168, -0.002, -0.005)],
324 ['CZ', 5, (2.791, -0.001, -0.003)],
325 ],
326 'VAL': [
327 ['N', 0, (-0.494, 1.373, -0.000)],
328 ['CA', 0, (0.000, 0.000, 0.000)],
329 ['C', 0, (1.527, -0.000, -0.000)],
330 ['CB', 0, (-0.533, -0.795, -1.213)],
331 ['O', 3, (0.627, 1.062, -0.000)],
332 ['CG1', 4, (0.540, 1.429, -0.000)],
333 ['CG2', 4, (0.533, -0.776, 1.203)],
334 ],
335 }
336
337 # A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
338 residue_atoms = {
339 'ALA': ['C', 'CA', 'CB', 'N', 'O'],
340 'ARG': ['C', 'CA', 'CB', 'CG', 'CD', 'CZ', 'N', 'NE', 'O', 'NH1', 'NH2'],
341 'ASP': ['C', 'CA', 'CB', 'CG', 'N', 'O', 'OD1', 'OD2'],
342 'ASN': ['C', 'CA', 'CB', 'CG', 'N', 'ND2', 'O', 'OD1'],
343 'CYS': ['C', 'CA', 'CB', 'N', 'O', 'SG'],
344 'GLU': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O', 'OE1', 'OE2'],
345 'GLN': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'NE2', 'O', 'OE1'],
346 'GLY': ['C', 'CA', 'N', 'O'],
347 'HIS': ['C', 'CA', 'CB', 'CG', 'CD2', 'CE1', 'N', 'ND1', 'NE2', 'O'],
348 'ILE': ['C', 'CA', 'CB', 'CG1', 'CG2', 'CD1', 'N', 'O'],
349 'LEU': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'N', 'O'],
350 'LYS': ['C', 'CA', 'CB', 'CG', 'CD', 'CE', 'N', 'NZ', 'O'],
351 'MET': ['C', 'CA', 'CB', 'CG', 'CE', 'N', 'O', 'SD'],
352 'PHE': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O'],
353 'PRO': ['C', 'CA', 'CB', 'CG', 'CD', 'N', 'O'],
354 'SER': ['C', 'CA', 'CB', 'N', 'O', 'OG'],
355 'THR': ['C', 'CA', 'CB', 'CG2', 'N', 'O', 'OG1'],
356 'TRP': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE2', 'CE3', 'CZ2', 'CZ3',
357 'CH2', 'N', 'NE1', 'O'],
358 'TYR': ['C', 'CA', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'N', 'O',
359 'OH'],
360 'VAL': ['C', 'CA', 'CB', 'CG1', 'CG2', 'N', 'O']
361 }
362
363 # Naming swaps for ambiguous atom names.
364 # Due to symmetries in the amino acids the naming of atoms is ambiguous in
365 # 4 of the 20 amino acids.
366 # (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
367 # in LEU, VAL and ARG can be resolved by using the 3d constellations of
368 # the 'ambiguous' atoms and their neighbours)
369 residue_atom_renaming_swaps = {
370 'ASP': {'OD1': 'OD2'},
371 'GLU': {'OE1': 'OE2'},
372 'PHE': {'CD1': 'CD2', 'CE1': 'CE2'},
373 'TYR': {'CD1': 'CD2', 'CE1': 'CE2'},
374 }
375
376 # Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
377 van_der_waals_radius = {
378 'C': 1.7,
379 'N': 1.55,
380 'O': 1.52,
381 'S': 1.8,
382 }
383
384 Bond = collections.namedtuple(
385 'Bond', ['atom1_name', 'atom2_name', 'length', 'stddev'])
386 BondAngle = collections.namedtuple(
387 'BondAngle',
388 ['atom1_name', 'atom2_name', 'atom3name', 'angle_rad', 'stddev'])
389
390
391 @functools.lru_cache(maxsize=None)
392 def load_stereo_chemical_props() -> Tuple[Mapping[str, List[Bond]],
393 Mapping[str, List[Bond]],
394 Mapping[str, List[BondAngle]]]:
395 """Load stereo_chemical_props.txt into a nice structure.
396
397 Load literature values for bond lengths and bond angles and translate
398 bond angles into the length of the opposite edge of the triangle
399 ("residue_virtual_bonds").
400
401 Returns:
402 residue_bonds: Dict that maps resname -> list of Bond tuples.
403 residue_virtual_bonds: Dict that maps resname -> list of Bond tuples.
404 residue_bond_angles: Dict that maps resname -> list of BondAngle tuples.
405 """
406 stereo_chemical_props_path = os.path.join(
407 os.path.dirname(os.path.abspath(__file__)), 'stereo_chemical_props.txt'
408 )
409 with open(stereo_chemical_props_path, 'rt') as f:
410 stereo_chemical_props = f.read()
411 lines_iter = iter(stereo_chemical_props.splitlines())
412 # Load bond lengths.
413 residue_bonds = {}
414 next(lines_iter) # Skip header line.
415 for line in lines_iter:
416 if line.strip() == '-':
417 break
418 bond, resname, length, stddev = line.split()
419 atom1, atom2 = bond.split('-')
420 if resname not in residue_bonds:
421 residue_bonds[resname] = []
422 residue_bonds[resname].append(
423 Bond(atom1, atom2, float(length), float(stddev)))
424 residue_bonds['UNK'] = []
425
426 # Load bond angles.
427 residue_bond_angles = {}
428 next(lines_iter) # Skip empty line.
429 next(lines_iter) # Skip header line.
430 for line in lines_iter:
431 if line.strip() == '-':
432 break
433 bond, resname, angle_degree, stddev_degree = line.split()
434 atom1, atom2, atom3 = bond.split('-')
435 if resname not in residue_bond_angles:
436 residue_bond_angles[resname] = []
437 residue_bond_angles[resname].append(
438 BondAngle(atom1, atom2, atom3,
439 float(angle_degree) / 180. * np.pi,
440 float(stddev_degree) / 180. * np.pi))
441 residue_bond_angles['UNK'] = []
442
443 def make_bond_key(atom1_name, atom2_name):
444 """Unique key to lookup bonds."""
445 return '-'.join(sorted([atom1_name, atom2_name]))
446
447 # Translate bond angles into distances ("virtual bonds").
448 residue_virtual_bonds = {}
449 for resname, bond_angles in residue_bond_angles.items():
450 # Create a fast lookup dict for bond lengths.
451 bond_cache = {}
452 for b in residue_bonds[resname]:
453 bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
454 residue_virtual_bonds[resname] = []
455 for ba in bond_angles:
456 bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
457 bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]
458
459 # Compute distance between atom1 and atom3 using the law of cosines
460 # c^2 = a^2 + b^2 - 2ab*cos(gamma).
461 gamma = ba.angle_rad
462 length = np.sqrt(bond1.length**2 + bond2.length**2
463 - 2 * bond1.length * bond2.length * np.cos(gamma))
464
465 # Propagation of uncertainty assuming uncorrelated errors.
466 dl_outer = 0.5 / length
467 dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
468 dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
469 dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
470 stddev = np.sqrt((dl_dgamma * ba.stddev)**2 +
471 (dl_db1 * bond1.stddev)**2 +
472 (dl_db2 * bond2.stddev)**2)
473 residue_virtual_bonds[resname].append(
474 Bond(ba.atom1_name, ba.atom3name, length, stddev))
475
476 return (residue_bonds,
477 residue_virtual_bonds,
478 residue_bond_angles)
479
480
481 # Between-residue bond lengths for general bonds (first element) and for Proline
482 # (second element).
483 between_res_bond_length_c_n = [1.329, 1.341]
484 between_res_bond_length_stddev_c_n = [0.014, 0.016]
485
486 # Between-residue cos_angles.
487 between_res_cos_angles_c_n_ca = [-0.5203, 0.0353] # degrees: 121.352 +- 2.315
488 between_res_cos_angles_ca_c_n = [-0.4473, 0.0311] # degrees: 116.568 +- 1.995
489
490 # This mapping is used when we need to store atom data in a format that requires
491 # fixed atom data size for every residue (e.g. a numpy array).
492 atom_types = [
493 'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
494 'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
495 'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
496 'CZ3', 'NZ', 'OXT'
497 ]
498 atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
499 atom_type_num = len(atom_types) # := 37.
500
501 # A compact atom encoding with 14 columns
502 # pylint: disable=line-too-long
503 # pylint: disable=bad-whitespace
504 restype_name_to_atom14_names = {
505 'ALA': ['N', 'CA', 'C', 'O', 'CB', '', '', '', '', '', '', '', '', ''],
506 'ARG': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1', 'NH2', '', '', ''],
507 'ASN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'ND2', '', '', '', '', '', ''],
508 'ASP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'OD1', 'OD2', '', '', '', '', '', ''],
509 'CYS': ['N', 'CA', 'C', 'O', 'CB', 'SG', '', '', '', '', '', '', '', ''],
510 'GLN': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'NE2', '', '', '', '', ''],
511 'GLU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'OE1', 'OE2', '', '', '', '', ''],
512 'GLY': ['N', 'CA', 'C', 'O', '', '', '', '', '', '', '', '', '', ''],
513 'HIS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'ND1', 'CD2', 'CE1', 'NE2', '', '', '', ''],
514 'ILE': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'CD1', '', '', '', '', '', ''],
515 'LEU': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', '', '', '', '', '', ''],
516 'LYS': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', '', '', '', '', ''],
517 'MET': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'SD', 'CE', '', '', '', '', '', ''],
518 'PHE': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', '', '', ''],
519 'PRO': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', '', '', '', '', '', '', ''],
520 'SER': ['N', 'CA', 'C', 'O', 'CB', 'OG', '', '', '', '', '', '', '', ''],
521 'THR': ['N', 'CA', 'C', 'O', 'CB', 'OG1', 'CG2', '', '', '', '', '', '', ''],
522 'TRP': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'NE1', 'CE2', 'CE3', 'CZ2', 'CZ3', 'CH2'],
523 'TYR': ['N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2', 'CZ', 'OH', '', ''],
524 'VAL': ['N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', '', '', '', '', '', '', ''],
525 'UNK': ['', '', '', '', '', '', '', '', '', '', '', '', '', ''],
526
527 }
528 # pylint: enable=line-too-long
529 # pylint: enable=bad-whitespace
530
531
532 # This is the standard residue order when coding AA type as a number.
533 # Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
534 restypes = [
535 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P',
536 'S', 'T', 'W', 'Y', 'V'
537 ]
538 restype_order = {restype: i for i, restype in enumerate(restypes)}
539 restype_num = len(restypes) # := 20.
540 unk_restype_index = restype_num # Catch-all index for unknown restypes.
541
542 restypes_with_x = restypes + ['X']
543 restype_order_with_x = {restype: i for i, restype in enumerate(restypes_with_x)}
544
545
546 def sequence_to_onehot(
547 sequence: str,
548 mapping: Mapping[str, int],
549 map_unknown_to_x: bool = False) -> np.ndarray:
550 """Maps the given sequence into a one-hot encoded matrix.
551
552 Args:
553 sequence: An amino acid sequence.
554 mapping: A dictionary mapping amino acids to integers.
555 map_unknown_to_x: If True, any amino acid that is not in the mapping will be
556 mapped to the unknown amino acid 'X'. If the mapping doesn't contain
557 amino acid 'X', an error will be thrown. If False, any amino acid not in
558 the mapping will throw an error.
559
560 Returns:
561 A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
562 the sequence.
563
564 Raises:
565 ValueError: If the mapping doesn't contain values from 0 to
566 num_unique_aas - 1 without any gaps.
567 """
568 num_entries = max(mapping.values()) + 1
569
570 if sorted(set(mapping.values())) != list(range(num_entries)):
571 raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
572 'without any gaps. Got: %s' % sorted(mapping.values()))
573
574 one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)
575
576 for aa_index, aa_type in enumerate(sequence):
577 if map_unknown_to_x:
578 if aa_type.isalpha() and aa_type.isupper():
579 aa_id = mapping.get(aa_type, mapping['X'])
580 else:
581 raise ValueError(f'Invalid character in the sequence: {aa_type}')
582 else:
583 aa_id = mapping[aa_type]
584 one_hot_arr[aa_index, aa_id] = 1
585
586 return one_hot_arr
587
588
589 restype_1to3 = {
590 'A': 'ALA',
591 'R': 'ARG',
592 'N': 'ASN',
593 'D': 'ASP',
594 'C': 'CYS',
595 'Q': 'GLN',
596 'E': 'GLU',
597 'G': 'GLY',
598 'H': 'HIS',
599 'I': 'ILE',
600 'L': 'LEU',
601 'K': 'LYS',
602 'M': 'MET',
603 'F': 'PHE',
604 'P': 'PRO',
605 'S': 'SER',
606 'T': 'THR',
607 'W': 'TRP',
608 'Y': 'TYR',
609 'V': 'VAL',
610 }
611
612
613 # NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
614 # 1-to-1 mapping of 3 letter names to one letter names. The latter contains
615 # many more, and less common, three letter names as keys and maps many of these
616 # to the same one letter name (including 'X' and 'U' which we don't use here).
617 restype_3to1 = {v: k for k, v in restype_1to3.items()}
618
619 # Define a restype name for all unknown residues.
620 unk_restype = 'UNK'
621
622 resnames = [restype_1to3[r] for r in restypes] + [unk_restype]
623 resname_to_idx = {resname: i for i, resname in enumerate(resnames)}
624
625
626 # The mapping here uses hhblits convention, so that B is mapped to D, J and O
627 # are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
628 # remaining 20 amino acids are kept in alphabetical order.
629 # There are 2 non-amino acid codes, X (representing any amino acid) and
630 # "-" representing a missing amino acid in an alignment. The id for these
631 # codes is put at the end (20 and 21) so that they can easily be ignored if
632 # desired.
633 HHBLITS_AA_TO_ID = {
634 'A': 0,
635 'B': 2,
636 'C': 1,
637 'D': 2,
638 'E': 3,
639 'F': 4,
640 'G': 5,
641 'H': 6,
642 'I': 7,
643 'J': 20,
644 'K': 8,
645 'L': 9,
646 'M': 10,
647 'N': 11,
648 'O': 20,
649 'P': 12,
650 'Q': 13,
651 'R': 14,
652 'S': 15,
653 'T': 16,
654 'U': 1,
655 'V': 17,
656 'W': 18,
657 'X': 20,
658 'Y': 19,
659 'Z': 3,
660 '-': 21,
661 }
662
663 # Partial inversion of HHBLITS_AA_TO_ID.
664 ID_TO_HHBLITS_AA = {
665 0: 'A',
666 1: 'C', # Also U.
667 2: 'D', # Also B.
668 3: 'E', # Also Z.
669 4: 'F',
670 5: 'G',
671 6: 'H',
672 7: 'I',
673 8: 'K',
674 9: 'L',
675 10: 'M',
676 11: 'N',
677 12: 'P',
678 13: 'Q',
679 14: 'R',
680 15: 'S',
681 16: 'T',
682 17: 'V',
683 18: 'W',
684 19: 'Y',
685 20: 'X', # Includes J and O.
686 21: '-',
687 }
688
689 restypes_with_x_and_gap = restypes + ['X', '-']
690 MAP_HHBLITS_AATYPE_TO_OUR_AATYPE = tuple(
691 restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i])
692 for i in range(len(restypes_with_x_and_gap)))
693
694
695 def _make_standard_atom_mask() -> np.ndarray:
696 """Returns [num_res_types, num_atom_types] mask array."""
697 # +1 to account for unknown (all 0s).
698 mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
699 for restype, restype_letter in enumerate(restypes):
700 restype_name = restype_1to3[restype_letter]
701 atom_names = residue_atoms[restype_name]
702 for atom_name in atom_names:
703 atom_type = atom_order[atom_name]
704 mask[restype, atom_type] = 1
705 return mask
706
707
708 STANDARD_ATOM_MASK = _make_standard_atom_mask()
709
710
711 # A one hot representation for the first and second atoms defining the axis
712 # of rotation for each chi-angle in each residue.
713 def chi_angle_atom(atom_index: int) -> np.ndarray:
714 """Define chi-angle rigid groups via one-hot representations."""
715 chi_angles_index = {}
716 one_hots = []
717
718 for k, v in chi_angles_atoms.items():
719 indices = [atom_types.index(s[atom_index]) for s in v]
720 indices.extend([-1]*(4-len(indices)))
721 chi_angles_index[k] = indices
722
723 for r in restypes:
724 res3 = restype_1to3[r]
725 one_hot = np.eye(atom_type_num)[chi_angles_index[res3]]
726 one_hots.append(one_hot)
727
728 one_hots.append(np.zeros([4, atom_type_num])) # Add zeros for residue `X`.
729 one_hot = np.stack(one_hots, axis=0)
730 one_hot = np.transpose(one_hot, [0, 2, 1])
731
732 return one_hot
733
734 chi_atom_1_one_hot = chi_angle_atom(1)
735 chi_atom_2_one_hot = chi_angle_atom(2)
736
737 # An array like chi_angles_atoms but using indices rather than names.
738 chi_angles_atom_indices = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
739 chi_angles_atom_indices = tree.map_structure(
740 lambda atom_name: atom_order[atom_name], chi_angles_atom_indices)
741 chi_angles_atom_indices = np.array([
742 chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms)))
743 for chi_atoms in chi_angles_atom_indices])
744
745 # Mapping from (res_name, atom_name) pairs to the atom's chi group index
746 # and atom index within that group.
747 chi_groups_for_atom = collections.defaultdict(list)
748 for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
749 for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
750 for atom_i, atom in enumerate(chi_group):
751 chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
752 chi_groups_for_atom = dict(chi_groups_for_atom)
753
754
755 def _make_rigid_transformation_4x4(ex, ey, translation):
756 """Create a rigid 4x4 transformation matrix from two axes and transl."""
757 # Normalize ex.
758 ex_normalized = ex / np.linalg.norm(ex)
759
760 # make ey perpendicular to ex
761 ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
762 ey_normalized /= np.linalg.norm(ey_normalized)
763
764 # compute ez as cross product
765 eznorm = np.cross(ex_normalized, ey_normalized)
766 m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
767 m = np.concatenate([m, [[0., 0., 0., 1.]]], axis=0)
768 return m
769
770
771 # create an array with (restype, atomtype) --> rigid_group_idx
772 # and an array with (restype, atomtype, coord) for the atom positions
773 # and compute affine transformation matrices (4,4) from one rigid group to the
774 # previous group
775 restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=np.int)
776 restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
777 restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
778 restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=np.int)
779 restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
780 restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
781 restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
782
783
784 def _make_rigid_group_constants():
785 """Fill the arrays above."""
786 for restype, restype_letter in enumerate(restypes):
787 resname = restype_1to3[restype_letter]
788 for atomname, group_idx, atom_position in rigid_group_atom_positions[
789 resname]:
790 atomtype = atom_order[atomname]
791 restype_atom37_to_rigid_group[restype, atomtype] = group_idx
792 restype_atom37_mask[restype, atomtype] = 1
793 restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position
794
795 atom14idx = restype_name_to_atom14_names[resname].index(atomname)
796 restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
797 restype_atom14_mask[restype, atom14idx] = 1
798 restype_atom14_rigid_group_positions[restype,
799 atom14idx, :] = atom_position
800
801 for restype, restype_letter in enumerate(restypes):
802 resname = restype_1to3[restype_letter]
803 atom_positions = {name: np.array(pos) for name, _, pos
804 in rigid_group_atom_positions[resname]}
805
806 # backbone to backbone is the identity transform
807 restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)
808
809 # pre-omega-frame to backbone (currently dummy identity matrix)
810 restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)
811
812 # phi-frame to backbone
813 mat = _make_rigid_transformation_4x4(
814 ex=atom_positions['N'] - atom_positions['CA'],
815 ey=np.array([1., 0., 0.]),
816 translation=atom_positions['N'])
817 restype_rigid_group_default_frame[restype, 2, :, :] = mat
818
819 # psi-frame to backbone
820 mat = _make_rigid_transformation_4x4(
821 ex=atom_positions['C'] - atom_positions['CA'],
822 ey=atom_positions['CA'] - atom_positions['N'],
823 translation=atom_positions['C'])
824 restype_rigid_group_default_frame[restype, 3, :, :] = mat
825
826 # chi1-frame to backbone
827 if chi_angles_mask[restype][0]:
828 base_atom_names = chi_angles_atoms[resname][0]
829 base_atom_positions = [atom_positions[name] for name in base_atom_names]
830 mat = _make_rigid_transformation_4x4(
831 ex=base_atom_positions[2] - base_atom_positions[1],
832 ey=base_atom_positions[0] - base_atom_positions[1],
833 translation=base_atom_positions[2])
834 restype_rigid_group_default_frame[restype, 4, :, :] = mat
835
836 # chi2-frame to chi1-frame
837 # chi3-frame to chi2-frame
838 # chi4-frame to chi3-frame
839 # luckily all rotation axes for the next frame start at (0,0,0) of the
840 # previous frame
841 for chi_idx in range(1, 4):
842 if chi_angles_mask[restype][chi_idx]:
843 axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
844 axis_end_atom_position = atom_positions[axis_end_atom_name]
845 mat = _make_rigid_transformation_4x4(
846 ex=axis_end_atom_position,
847 ey=np.array([-1., 0., 0.]),
848 translation=axis_end_atom_position)
849 restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat
850
851
852 _make_rigid_group_constants()
853
854
855 def make_atom14_dists_bounds(overlap_tolerance=1.5,
856 bond_length_tolerance_factor=15):
857 """compute upper and lower bounds for bonds to assess violations."""
858 restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32)
859 restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32)
860 restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32)
861 residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()
862 for restype, restype_letter in enumerate(restypes):
863 resname = restype_1to3[restype_letter]
864 atom_list = restype_name_to_atom14_names[resname]
865
866 # create lower and upper bounds for clashes
867 for atom1_idx, atom1_name in enumerate(atom_list):
868 if not atom1_name:
869 continue
870 atom1_radius = van_der_waals_radius[atom1_name[0]]
871 for atom2_idx, atom2_name in enumerate(atom_list):
872 if (not atom2_name) or atom1_idx == atom2_idx:
873 continue
874 atom2_radius = van_der_waals_radius[atom2_name[0]]
875 lower = atom1_radius + atom2_radius - overlap_tolerance
876 upper = 1e10
877 restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
878 restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
879 restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
880 restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
881
882 # overwrite lower and upper bounds for bonds and angles
883 for b in residue_bonds[resname] + residue_virtual_bonds[resname]:
884 atom1_idx = atom_list.index(b.atom1_name)
885 atom2_idx = atom_list.index(b.atom2_name)
886 lower = b.length - bond_length_tolerance_factor * b.stddev
887 upper = b.length + bond_length_tolerance_factor * b.stddev
888 restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
889 restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
890 restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
891 restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
892 restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev
893 restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev
894 return {'lower_bound': restype_atom14_bond_lower_bound, # shape (21,14,14)
895 'upper_bound': restype_atom14_bond_upper_bound, # shape (21,14,14)
896 'stddev': restype_atom14_bond_stddev, # shape (21,14,14)
897 }