diff docker/alphafold/alphafold/common/protein.py @ 1:6c92e000d684 draft

"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86"
author galaxy-australia
date Tue, 01 Mar 2022 02:53:05 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/docker/alphafold/alphafold/common/protein.py	Tue Mar 01 02:53:05 2022 +0000
@@ -0,0 +1,278 @@
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Protein data type."""
+import dataclasses
+import io
+from typing import Any, Mapping, Optional
+from alphafold.common import residue_constants
+from Bio.PDB import PDBParser
+import numpy as np
+
+FeatureDict = Mapping[str, np.ndarray]
+ModelOutput = Mapping[str, Any]  # Is a nested dict.
+
+# Complete sequence of chain IDs supported by the PDB format.
+PDB_CHAIN_IDS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
+PDB_MAX_CHAINS = len(PDB_CHAIN_IDS)  # := 62.
+
+
+@dataclasses.dataclass(frozen=True)
+class Protein:
+  """Protein structure representation."""
+
+  # Cartesian coordinates of atoms in angstroms. The atom types correspond to
+  # residue_constants.atom_types, i.e. the first three are N, CA, CB.
+  atom_positions: np.ndarray  # [num_res, num_atom_type, 3]
+
+  # Amino-acid type for each residue represented as an integer between 0 and
+  # 20, where 20 is 'X'.
+  aatype: np.ndarray  # [num_res]
+
+  # Binary float mask to indicate presence of a particular atom. 1.0 if an atom
+  # is present and 0.0 if not. This should be used for loss masking.
+  atom_mask: np.ndarray  # [num_res, num_atom_type]
+
+  # Residue index as used in PDB. It is not necessarily continuous or 0-indexed.
+  residue_index: np.ndarray  # [num_res]
+
+  # 0-indexed number corresponding to the chain in the protein that this residue
+  # belongs to.
+  chain_index: np.ndarray  # [num_res]
+
+  # B-factors, or temperature factors, of each residue (in sq. angstroms units),
+  # representing the displacement of the residue from its ground truth mean
+  # value.
+  b_factors: np.ndarray  # [num_res, num_atom_type]
+
+  def __post_init__(self):
+    if len(np.unique(self.chain_index)) > PDB_MAX_CHAINS:
+      raise ValueError(
+          f'Cannot build an instance with more than {PDB_MAX_CHAINS} chains '
+          'because these cannot be written to PDB format.')
+
+
+def from_pdb_string(pdb_str: str, chain_id: Optional[str] = None) -> Protein:
+  """Takes a PDB string and constructs a Protein object.
+
+  WARNING: All non-standard residue types will be converted into UNK. All
+    non-standard atoms will be ignored.
+
+  Args:
+    pdb_str: The contents of the pdb file
+    chain_id: If chain_id is specified (e.g. A), then only that chain
+      is parsed. Otherwise all chains are parsed.
+
+  Returns:
+    A new `Protein` parsed from the pdb contents.
+  """
+  pdb_fh = io.StringIO(pdb_str)
+  parser = PDBParser(QUIET=True)
+  structure = parser.get_structure('none', pdb_fh)
+  models = list(structure.get_models())
+  if len(models) != 1:
+    raise ValueError(
+        f'Only single model PDBs are supported. Found {len(models)} models.')
+  model = models[0]
+
+  atom_positions = []
+  aatype = []
+  atom_mask = []
+  residue_index = []
+  chain_ids = []
+  b_factors = []
+
+  for chain in model:
+    if chain_id is not None and chain.id != chain_id:
+      continue
+    for res in chain:
+      if res.id[2] != ' ':
+        raise ValueError(
+            f'PDB contains an insertion code at chain {chain.id} and residue '
+            f'index {res.id[1]}. These are not supported.')
+      res_shortname = residue_constants.restype_3to1.get(res.resname, 'X')
+      restype_idx = residue_constants.restype_order.get(
+          res_shortname, residue_constants.restype_num)
+      pos = np.zeros((residue_constants.atom_type_num, 3))
+      mask = np.zeros((residue_constants.atom_type_num,))
+      res_b_factors = np.zeros((residue_constants.atom_type_num,))
+      for atom in res:
+        if atom.name not in residue_constants.atom_types:
+          continue
+        pos[residue_constants.atom_order[atom.name]] = atom.coord
+        mask[residue_constants.atom_order[atom.name]] = 1.
+        res_b_factors[residue_constants.atom_order[atom.name]] = atom.bfactor
+      if np.sum(mask) < 0.5:
+        # If no known atom positions are reported for the residue then skip it.
+        continue
+      aatype.append(restype_idx)
+      atom_positions.append(pos)
+      atom_mask.append(mask)
+      residue_index.append(res.id[1])
+      chain_ids.append(chain.id)
+      b_factors.append(res_b_factors)
+
+  # Chain IDs are usually characters so map these to ints.
+  unique_chain_ids = np.unique(chain_ids)
+  chain_id_mapping = {cid: n for n, cid in enumerate(unique_chain_ids)}
+  chain_index = np.array([chain_id_mapping[cid] for cid in chain_ids])
+
+  return Protein(
+      atom_positions=np.array(atom_positions),
+      atom_mask=np.array(atom_mask),
+      aatype=np.array(aatype),
+      residue_index=np.array(residue_index),
+      chain_index=chain_index,
+      b_factors=np.array(b_factors))
+
+
+def _chain_end(atom_index, end_resname, chain_name, residue_index) -> str:
+  chain_end = 'TER'
+  return (f'{chain_end:<6}{atom_index:>5}      {end_resname:>3} '
+          f'{chain_name:>1}{residue_index:>4}')
+
+
+def to_pdb(prot: Protein) -> str:
+  """Converts a `Protein` instance to a PDB string.
+
+  Args:
+    prot: The protein to convert to PDB.
+
+  Returns:
+    PDB string.
+  """
+  restypes = residue_constants.restypes + ['X']
+  res_1to3 = lambda r: residue_constants.restype_1to3.get(restypes[r], 'UNK')
+  atom_types = residue_constants.atom_types
+
+  pdb_lines = []
+
+  atom_mask = prot.atom_mask
+  aatype = prot.aatype
+  atom_positions = prot.atom_positions
+  residue_index = prot.residue_index.astype(np.int32)
+  chain_index = prot.chain_index.astype(np.int32)
+  b_factors = prot.b_factors
+
+  if np.any(aatype > residue_constants.restype_num):
+    raise ValueError('Invalid aatypes.')
+
+  # Construct a mapping from chain integer indices to chain ID strings.
+  chain_ids = {}
+  for i in np.unique(chain_index):  # np.unique gives sorted output.
+    if i >= PDB_MAX_CHAINS:
+      raise ValueError(
+          f'The PDB format supports at most {PDB_MAX_CHAINS} chains.')
+    chain_ids[i] = PDB_CHAIN_IDS[i]
+
+  pdb_lines.append('MODEL     1')
+  atom_index = 1
+  last_chain_index = chain_index[0]
+  # Add all atom sites.
+  for i in range(aatype.shape[0]):
+    # Close the previous chain if in a multichain PDB.
+    if last_chain_index != chain_index[i]:
+      pdb_lines.append(_chain_end(
+          atom_index, res_1to3(aatype[i - 1]), chain_ids[chain_index[i - 1]],
+          residue_index[i - 1]))
+      last_chain_index = chain_index[i]
+      atom_index += 1  # Atom index increases at the TER symbol.
+
+    res_name_3 = res_1to3(aatype[i])
+    for atom_name, pos, mask, b_factor in zip(
+        atom_types, atom_positions[i], atom_mask[i], b_factors[i]):
+      if mask < 0.5:
+        continue
+
+      record_type = 'ATOM'
+      name = atom_name if len(atom_name) == 4 else f' {atom_name}'
+      alt_loc = ''
+      insertion_code = ''
+      occupancy = 1.00
+      element = atom_name[0]  # Protein supports only C, N, O, S, this works.
+      charge = ''
+      # PDB is a columnar format, every space matters here!
+      atom_line = (f'{record_type:<6}{atom_index:>5} {name:<4}{alt_loc:>1}'
+                   f'{res_name_3:>3} {chain_ids[chain_index[i]]:>1}'
+                   f'{residue_index[i]:>4}{insertion_code:>1}   '
+                   f'{pos[0]:>8.3f}{pos[1]:>8.3f}{pos[2]:>8.3f}'
+                   f'{occupancy:>6.2f}{b_factor:>6.2f}          '
+                   f'{element:>2}{charge:>2}')
+      pdb_lines.append(atom_line)
+      atom_index += 1
+
+  # Close the final chain.
+  pdb_lines.append(_chain_end(atom_index, res_1to3(aatype[-1]),
+                              chain_ids[chain_index[-1]], residue_index[-1]))
+  pdb_lines.append('ENDMDL')
+  pdb_lines.append('END')
+
+  # Pad all lines to 80 characters.
+  pdb_lines = [line.ljust(80) for line in pdb_lines]
+  return '\n'.join(pdb_lines) + '\n'  # Add terminating newline.
+
+
+def ideal_atom_mask(prot: Protein) -> np.ndarray:
+  """Computes an ideal atom mask.
+
+  `Protein.atom_mask` typically is defined according to the atoms that are
+  reported in the PDB. This function computes a mask according to heavy atoms
+  that should be present in the given sequence of amino acids.
+
+  Args:
+    prot: `Protein` whose fields are `numpy.ndarray` objects.
+
+  Returns:
+    An ideal atom mask.
+  """
+  return residue_constants.STANDARD_ATOM_MASK[prot.aatype]
+
+
+def from_prediction(
+    features: FeatureDict,
+    result: ModelOutput,
+    b_factors: Optional[np.ndarray] = None,
+    remove_leading_feature_dimension: bool = True) -> Protein:
+  """Assembles a protein from a prediction.
+
+  Args:
+    features: Dictionary holding model inputs.
+    result: Dictionary holding model outputs.
+    b_factors: (Optional) B-factors to use for the protein.
+    remove_leading_feature_dimension: Whether to remove the leading dimension
+      of the `features` values.
+
+  Returns:
+    A protein instance.
+  """
+  fold_output = result['structure_module']
+
+  def _maybe_remove_leading_dim(arr: np.ndarray) -> np.ndarray:
+    return arr[0] if remove_leading_feature_dimension else arr
+
+  if 'asym_id' in features:
+    chain_index = _maybe_remove_leading_dim(features['asym_id'])
+  else:
+    chain_index = np.zeros_like(_maybe_remove_leading_dim(features['aatype']))
+
+  if b_factors is None:
+    b_factors = np.zeros_like(fold_output['final_atom_mask'])
+
+  return Protein(
+      aatype=_maybe_remove_leading_dim(features['aatype']),
+      atom_positions=fold_output['final_atom_positions'],
+      atom_mask=fold_output['final_atom_mask'],
+      residue_index=_maybe_remove_leading_dim(features['residue_index']) + 1,
+      chain_index=chain_index,
+      b_factors=b_factors)