diff docker/alphafold/docker/run_docker.py @ 1:6c92e000d684 draft

"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit a510e97ebd604a5e30b1f16e5031f62074f23e86"
author galaxy-australia
date Tue, 01 Mar 2022 02:53:05 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/docker/alphafold/docker/run_docker.py	Tue Mar 01 02:53:05 2022 +0000
@@ -0,0 +1,231 @@
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Docker launch script for Alphafold docker image."""
+
+import os
+import pathlib
+import signal
+from typing import Tuple
+
+from absl import app
+from absl import flags
+from absl import logging
+import docker
+from docker import types
+
+
+flags.DEFINE_bool(
+    'use_gpu', True, 'Enable NVIDIA runtime to run with GPUs.')
+flags.DEFINE_string(
+    'gpu_devices', 'all',
+    'Comma separated list of devices to pass to NVIDIA_VISIBLE_DEVICES.')
+flags.DEFINE_list(
+    'fasta_paths', None, 'Paths to FASTA files, each containing a prediction '
+    'target that will be folded one after another. If a FASTA file contains '
+    'multiple sequences, then it will be folded as a multimer. Paths should be '
+    'separated by commas. All FASTA paths must have a unique basename as the '
+    'basename is used to name the output directories for each prediction.')
+flags.DEFINE_list(
+    'is_prokaryote_list', None, 'Optional for multimer system, not used by the '
+    'single chain system. This list should contain a boolean for each fasta '
+    'specifying true where the target complex is from a prokaryote, and false '
+    'where it is not, or where the origin is unknown. These values determine '
+    'the pairing method for the MSA.')
+flags.DEFINE_string(
+    'output_dir', '/tmp/alphafold',
+    'Path to a directory that will store the results.')
+flags.DEFINE_string(
+    'data_dir', None,
+    'Path to directory with supporting data: AlphaFold parameters and genetic '
+    'and template databases. Set to the target of download_all_databases.sh.')
+flags.DEFINE_string(
+    'docker_image_name', 'alphafold', 'Name of the AlphaFold Docker image.')
+flags.DEFINE_string(
+    'max_template_date', None,
+    'Maximum template release date to consider (ISO-8601 format: YYYY-MM-DD). '
+    'Important if folding historical test sets.')
+flags.DEFINE_enum(
+    'db_preset', 'full_dbs', ['full_dbs', 'reduced_dbs'],
+    'Choose preset MSA database configuration - smaller genetic database '
+    'config (reduced_dbs) or full genetic database config (full_dbs)')
+flags.DEFINE_enum(
+    'model_preset', 'monomer',
+    ['monomer', 'monomer_casp14', 'monomer_ptm', 'multimer'],
+    'Choose preset model configuration - the monomer model, the monomer model '
+    'with extra ensembling, monomer model with pTM head, or multimer model')
+flags.DEFINE_boolean(
+    'benchmark', False,
+    'Run multiple JAX model evaluations to obtain a timing that excludes the '
+    'compilation time, which should be more indicative of the time required '
+    'for inferencing many proteins.')
+flags.DEFINE_boolean(
+    'use_precomputed_msas', False,
+    'Whether to read MSAs that have been written to disk. WARNING: This will '
+    'not check if the sequence, database or configuration have changed.')
+
+FLAGS = flags.FLAGS
+
+_ROOT_MOUNT_DIRECTORY = '/mnt/'
+
+
+def _create_mount(mount_name: str, path: str) -> Tuple[types.Mount, str]:
+  path = os.path.abspath(path)
+  source_path = os.path.dirname(path)
+  target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, mount_name)
+  logging.info('Mounting %s -> %s', source_path, target_path)
+  mount = types.Mount(target_path, source_path, type='bind', read_only=True)
+  return mount, os.path.join(target_path, os.path.basename(path))
+
+
+def main(argv):
+  if len(argv) > 1:
+    raise app.UsageError('Too many command-line arguments.')
+
+  # You can individually override the following paths if you have placed the
+  # data in locations other than the FLAGS.data_dir.
+
+  # Path to the Uniref90 database for use by JackHMMER.
+  uniref90_database_path = os.path.join(
+      FLAGS.data_dir, 'uniref90', 'uniref90.fasta')
+
+  # Path to the Uniprot database for use by JackHMMER.
+  uniprot_database_path = os.path.join(
+      FLAGS.data_dir, 'uniprot', 'uniprot.fasta')
+
+  # Path to the MGnify database for use by JackHMMER.
+  mgnify_database_path = os.path.join(
+      FLAGS.data_dir, 'mgnify', 'mgy_clusters_2018_12.fa')
+
+  # Path to the BFD database for use by HHblits.
+  bfd_database_path = os.path.join(
+      FLAGS.data_dir, 'bfd',
+      'bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt')
+
+  # Path to the Small BFD database for use by JackHMMER.
+  small_bfd_database_path = os.path.join(
+      FLAGS.data_dir, 'small_bfd', 'bfd-first_non_consensus_sequences.fasta')
+
+  # Path to the Uniclust30 database for use by HHblits.
+  uniclust30_database_path = os.path.join(
+      FLAGS.data_dir, 'uniclust30', 'uniclust30_2018_08', 'uniclust30_2018_08')
+
+  # Path to the PDB70 database for use by HHsearch.
+  pdb70_database_path = os.path.join(FLAGS.data_dir, 'pdb70', 'pdb70')
+
+  # Path to the PDB seqres database for use by hmmsearch.
+  pdb_seqres_database_path = os.path.join(
+      FLAGS.data_dir, 'pdb_seqres', 'pdb_seqres.txt')
+
+  # Path to a directory with template mmCIF structures, each named <pdb_id>.cif.
+  template_mmcif_dir = os.path.join(FLAGS.data_dir, 'pdb_mmcif', 'mmcif_files')
+
+  # Path to a file mapping obsolete PDB IDs to their replacements.
+  obsolete_pdbs_path = os.path.join(FLAGS.data_dir, 'pdb_mmcif', 'obsolete.dat')
+
+  alphafold_path = pathlib.Path(__file__).parent.parent
+  data_dir_path = pathlib.Path(FLAGS.data_dir)
+  if alphafold_path == data_dir_path or alphafold_path in data_dir_path.parents:
+    raise app.UsageError(
+        f'The download directory {FLAGS.data_dir} should not be a subdirectory '
+        f'in the AlphaFold repository directory. If it is, the Docker build is '
+        f'slow since the large databases are copied during the image creation.')
+
+  mounts = []
+  command_args = []
+
+  # Mount each fasta path as a unique target directory.
+  target_fasta_paths = []
+  for i, fasta_path in enumerate(FLAGS.fasta_paths):
+    mount, target_path = _create_mount(f'fasta_path_{i}', fasta_path)
+    mounts.append(mount)
+    target_fasta_paths.append(target_path)
+  command_args.append(f'--fasta_paths={",".join(target_fasta_paths)}')
+
+  database_paths = [
+      ('uniref90_database_path', uniref90_database_path),
+      ('mgnify_database_path', mgnify_database_path),
+      ('data_dir', FLAGS.data_dir),
+      ('template_mmcif_dir', template_mmcif_dir),
+      ('obsolete_pdbs_path', obsolete_pdbs_path),
+  ]
+
+  if FLAGS.model_preset == 'multimer':
+    database_paths.append(('uniprot_database_path', uniprot_database_path))
+    database_paths.append(('pdb_seqres_database_path',
+                           pdb_seqres_database_path))
+  else:
+    database_paths.append(('pdb70_database_path', pdb70_database_path))
+
+  if FLAGS.db_preset == 'reduced_dbs':
+    database_paths.append(('small_bfd_database_path', small_bfd_database_path))
+  else:
+    database_paths.extend([
+        ('uniclust30_database_path', uniclust30_database_path),
+        ('bfd_database_path', bfd_database_path),
+    ])
+  for name, path in database_paths:
+    if path:
+      mount, target_path = _create_mount(name, path)
+      mounts.append(mount)
+      command_args.append(f'--{name}={target_path}')
+
+  output_target_path = os.path.join(_ROOT_MOUNT_DIRECTORY, 'output')
+  mounts.append(types.Mount(output_target_path, FLAGS.output_dir, type='bind'))
+
+  command_args.extend([
+      f'--output_dir={output_target_path}',
+      f'--max_template_date={FLAGS.max_template_date}',
+      f'--db_preset={FLAGS.db_preset}',
+      f'--model_preset={FLAGS.model_preset}',
+      f'--benchmark={FLAGS.benchmark}',
+      f'--use_precomputed_msas={FLAGS.use_precomputed_msas}',
+      '--logtostderr',
+  ])
+
+  if FLAGS.is_prokaryote_list:
+    command_args.append(
+        f'--is_prokaryote_list={",".join(FLAGS.is_prokaryote_list)}')
+
+  client = docker.from_env()
+  container = client.containers.run(
+      image=FLAGS.docker_image_name,
+      command=command_args,
+      runtime='nvidia' if FLAGS.use_gpu else None,
+      remove=True,
+      detach=True,
+      mounts=mounts,
+      environment={
+          'NVIDIA_VISIBLE_DEVICES': FLAGS.gpu_devices,
+          # The following flags allow us to make predictions on proteins that
+          # would typically be too long to fit into GPU memory.
+          'TF_FORCE_UNIFIED_MEMORY': '1',
+          'XLA_PYTHON_CLIENT_MEM_FRACTION': '4.0',
+      })
+
+  # Add signal handler to ensure CTRL+C also stops the running container.
+  signal.signal(signal.SIGINT,
+                lambda unused_sig, unused_frame: container.kill())
+
+  for line in container.logs(stream=True):
+    logging.info(line.strip().decode('utf-8'))
+
+
+if __name__ == '__main__':
+  flags.mark_flags_as_required([
+      'data_dir',
+      'fasta_paths',
+      'max_template_date',
+  ])
+  app.run(main)