Previous changeset 13:c0e71cb2bd1b (2022-10-12) Next changeset 15:a58f7eb0df2c (2023-03-10) |
Commit message:
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit d490defa32d9c318137d2d781243b392cb14110d-dirty |
modified:
README.rst alphafold.html alphafold.xml validate_fasta.py |
added:
outputs.py |
removed:
gen_extra_outputs.py |
b |
diff -r c0e71cb2bd1b -r d00e15139065 README.rst --- a/README.rst Wed Oct 12 22:25:20 2022 +0000 +++ b/README.rst Tue Feb 28 01:15:42 2023 +0000 |
b |
@@ -75,27 +75,41 @@ ~~~~~~~~~~~~~~ Alphafold needs reference data to run. The wrapper expects this data to -be present at ``/data/alphafold_databases``. A custom DB root can be read from -the ALPHAFOLD_DB environment variable, if set. To download the AlphaFold, -reference data, run the following shell script command in the tool directory. +be present at ``/data/alphafold_databases``. A custom path will be read from +the ``ALPHAFOLD_DB`` environment variable, if set. + +To download the AlphaFold reference DBs: :: - # Set databases root - ALPHAFOLD_DB_ROOT=/data/alphafold_databases + # Set your AlphaFold DB path + ALPHAFOLD_DB=/data/alphafold_databases + + # Set your target AlphaFold version + ALPHAFOLD_VERSION= # e.g. 2.1.2 + + # Download repo + wget https://github.com/deepmind/alphafold/archive/refs/tags/v${ALPHAFOLD_VERSION}.tar.gz + tar xzf v${ALPHAFOLD_VERSION}.tar.gz - # make folders if needed - mkdir -p $ALPHAFOLD_DB_ROOT + # Ensure dirs + mkdir -p $ALPHAFOLD_DB - # download ref data - bash scripts/download_all_data.sh $ALPHAFOLD_DB_ROOT + # Download + bash alphafold*/scripts/download_all_data.sh $ALPHAFOLD_DB -This will install the reference data to ``/data/alphafold_databases``. +You will most likely want to run this as a background job, as it will take a +very long time (7+ days in Australia). + +This will install the reference data to your ``$ALPHAFOLD_DB``. 
To check this has worked, ensure the final folder structure is as follows: :: + # NOTE: this structure will change between minor AlphaFold versions + # The tree shown below was updated for v2.3.1 + data/alphafold_databases ├── bfd │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata @@ -105,18 +119,23 @@ │ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata │ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex ├── mgnify - │ └── mgy_clusters_2018_12.fa + │ └── mgy_clusters_2022_05.fa ├── params │ ├── LICENSE │ ├── params_model_1.npz + │ ├── params_model_1_multimer_v3.npz │ ├── params_model_1_ptm.npz │ ├── params_model_2.npz + │ ├── params_model_2_multimer_v3.npz │ ├── params_model_2_ptm.npz │ ├── params_model_3.npz + │ ├── params_model_3_multimer_v3.npz │ ├── params_model_3_ptm.npz │ ├── params_model_4.npz + │ ├── params_model_4_multimer_v3.npz │ ├── params_model_4_ptm.npz │ ├── params_model_5.npz + │ ├── params_model_5_multimer_v3.npz │ └── params_model_5_ptm.npz ├── pdb70 │ ├── md5sum @@ -131,10 +150,20 @@ ├── pdb_mmcif │ ├── mmcif_files │ └── obsolete.dat - ├── uniclust30 - │ └── uniclust30_2018_08 + ├── pdb_seqres + │ └── pdb_seqres.txt + ├── uniprot + │ └── uniprot.fasta + ├── uniref30 + │ ├── UniRef30_2021_03.md5sums + │ ├── UniRef30_2021_03_a3m.ffdata + │ ├── UniRef30_2021_03_a3m.ffindex + │ ├── UniRef30_2021_03_cs219.ffdata + │ ├── UniRef30_2021_03_cs219.ffindex + │ ├── UniRef30_2021_03_hhm.ffdata + │ └── UniRef30_2021_03_hhm.ffindex └── uniref90 - └── uniref90.fasta + └── uniref90.fasta In more recent releases of the AlphaFold tool, you will need to download an additional file to allow the ``reduced_dbs`` option: @@ -152,6 +181,25 @@ │ └── bfd-first_non_consensus_sequences.fasta +**Upgrading database versions** + +When upgrading to a new minor version of AlphaFold, you will most likely have to +upgrade the reference database. 
This can be a pain, due to the size of the +databases and the obscurity around what has changed. The simplest way to do +this is to simply create a new directory and download the DBs from scratch. +However, you can save a considerable amount of time by downloading only the +components that have changed. + +If you wish to continue hosting prior versions of the tool, you must maintain +the reference DBs for each version. The ``ALPHAFOLD_DB`` environment variable +must then be set respectively for each tool version in your job conf (on Galaxy +AU this is currently `configured with TPV <https://github.com/usegalaxy-au/infrastructure/blob/master/files/galaxy/dynamic_job_rules/production/total_perspective_vortex/tools.yml#L1515-L1554>`_). + +To minimize redundancy between DB versions, we have symlinked the database +components that are unchanging between versions. In ``v2.1.2 -> v2.3.1`` the BFD +database is the only component that is persistent, but it is by far the +largest on disk. + JOB DESTINATION ~~~~~~~~~~~~~~~ |
b |
diff -r c0e71cb2bd1b -r d00e15139065 alphafold.html --- a/alphafold.html Wed Oct 12 22:25:20 2022 +0000 +++ b/alphafold.html Tue Feb 28 01:15:42 2023 +0000 |
b |
@@ -336,26 +336,26 @@ <div class="flex col controls"> <div class="box text-center"> <h3> Select model </h3> - <p>The top five structures predicted by Alphafold</p> + <p>The top-ranked structures predicted by Alphafold</p> <div> <button class="btn selected" id="btn-ranked_0" onclick="setModel(0);"> - Model 1 + Ranked 0 </button> <button class="btn" id="btn-ranked_1" onclick="setModel(1);"> - Model 2 + Ranked 1 </button> <button class="btn" id="btn-ranked_2" onclick="setModel(2);"> - Model 3 + Ranked 2 </button> <button class="btn" id="btn-ranked_3" onclick="setModel(3);"> - Model 4 + Ranked 3 </button> <button class="btn" id="btn-ranked_4" onclick="setModel(4);"> - Model 5 + Ranked 4 </button> </div> </div> |
b |
diff -r c0e71cb2bd1b -r d00e15139065 alphafold.xml --- a/alphafold.xml Wed Oct 12 22:25:20 2022 +0000 +++ b/alphafold.xml Tue Feb 28 01:15:42 2023 +0000 |
[ |
b'@@ -1,8 +1,8 @@\n <tool id="alphafold" name="Alphafold 2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">\n <description> - AI-guided 3D structural prediction of proteins</description>\n <macros>\n- <token name="@TOOL_VERSION@">2.1.2</token>\n- <token name="@VERSION_SUFFIX@">4</token>\n+ <token name="@TOOL_VERSION@">2.3.1</token>\n+ <token name="@VERSION_SUFFIX@">0</token>\n </macros>\n <edam_topics>\n <edam_topic>topic_0082</edam_topic>\n@@ -11,92 +11,93 @@\n <edam_operation>operation_0474</edam_operation>\n </edam_operations>\n <xrefs>\n- <xref type="bio.tools">alphafold_2.0</xref>\n+ <xref type="bio.tools">alphafold_2</xref>\n </xrefs>\n <requirements>\n- <container type="docker">neoformit/alphafold:v2.1.2_0</container>\n+ <container type="docker">neoformit/alphafold:v2.3.1_1</container>\n </requirements>\n <command detect_errors="exit_code"><![CDATA[\n \n ## $ALPHAFOLD_DB variable should point to the location of the AlphaFold\n ## databases - defaults to /data\n \n-## fasta setup ----------------------------\n+## Read FASTA input ----------------------------\n #if $fasta_or_text.input_mode == \'history\':\n- cp \'$fasta_or_text.fasta_file\' input.fasta &&\n+ cp \'$fasta_or_text.fasta_file\' input.fasta\n \n #elif $fasta_or_text.input_mode == \'textbox\':\n- echo \'$fasta_or_text.fasta_text\' > input.fasta &&\n+ echo \'$fasta_or_text.fasta_text\' > input.fasta\n #end if\n \n-python3 \'$__tool_directory__/validate_fasta.py\' input.fasta\n+&& python3 \'$__tool_directory__/validate_fasta.py\' input.fasta\n --min_length \\${ALPHAFOLD_AA_LENGTH_MIN:-0}\n --max_length \\${ALPHAFOLD_AA_LENGTH_MAX:-0}\n #if $multimer:\n --multimer\n #end if\n-> alphafold.fasta &&\n-\n-## env vars -------------------------------\n-export TF_FORCE_UNIFIED_MEMORY=1 &&\n-export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 &&\n-export DATE=`date +"%Y-%m-%d"` &&\n+> alphafold.fasta\n \n-## run alphafold -------------------------\n-python 
/app/alphafold/run_alphafold.py\n---fasta_paths alphafold.fasta\n---output_dir output\n---data_dir \\${ALPHAFOLD_DB:-/data}\n---max_template_date=\\$DATE\n+## Env vars -------------------------------\n+&& export TF_FORCE_UNIFIED_MEMORY=1\n+&& export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0\n+&& export TODAY=`date +"%Y-%m-%d"`\n+\n+## Run alphafold -------------------------\n+&& python /app/alphafold/run_alphafold.py\n+ --fasta_paths alphafold.fasta\n+ --output_dir output\n+ --data_dir \\${ALPHAFOLD_DB:-/data}\n \n-## Set reference data explicitly\n---uniref90_database_path \\${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta\n---mgnify_database_path \\${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2018_12.fa\n---template_mmcif_dir \\${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files\n---obsolete_pdbs_path \\${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat\n-#if $dbs == \'full\':\n---bfd_database_path \\${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt\n---uniclust30_database_path \\${ALPHAFOLD_DB:-/data}/uniclust30/uniclust30_2018_08/uniclust30_2018_08\n-#else\n---db_preset=reduced_dbs\n---small_bfd_database_path \\${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta\n-#end if\n+ ## Set reference database paths\n+ --uniref90_database_path \\${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta\n+ --mgnify_database_path \\${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa\n+ --template_mmcif_dir \\${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files\n+ --obsolete_pdbs_path \\${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat\n+ #if $dbs == \'full\':\n+ --bfd_database_path \\${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt\n+ --uniref30_database_path \\${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03\n+ #else\n+ --db_preset=reduced_dbs\n+ --small_bfd_database_path \\${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta\n+ #end if\n \n-## Param introduced in AlphaFold v2.1.2:'..b' | These files 
describe the molecular structures and can be used for downstream analysis. e.g. *in silico* molecular docking.\n+ | **PLEASE NOTE** that all outputs have been renamed to their respective rank order, including model and model.pkl files.\n |\n \n *Model confidence scores (optional)*\n@@ -280,16 +386,28 @@\n |\n |\n \n+ *Model data files (ranked_n.pkl)*\n+\n+ | Per-model data stored in pickle files (a Python binary data format). These files can be used as inputs to downstream analysis software (such as Chimera X) for visualizing structures and computing kinetics between protein multimers and domains.\n+ | The tool will produce one ``.pkl`` output for each of the PDB models.\n+ |\n+ |\n+\n+ *relax_metrics.json (optional)*\n+\n+ | A JSON-formatted text file containing relax metrics (mostly remaining violations).\n+ |\n+\n **AlphaFold configuration**\n \n | We have configured AlphaFold to run with the parameters suggested by default on `AlphaFold\'s GitHub <https://github.com/deepmind/alphafold>`_.\n- | This means that it runs against the full database with Amber relaxation, with ``max_template_date`` set to today\'s date. If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.\n+ | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets. 
If there are additional parameters that you would like to interact with, please `send a support request to Galaxy AU <https://site.usegalaxy.org.au/request/support>`_, or open an issue on `our GitHub <https://github.com/usegalaxy-au/tools-au>`_.\n |\n |\n \n **External Resources**\n \n- We HIGHLY recommend checking out the\n+ We highly recommend checking out the\n `Alphafold Protein Structure Database <https://alphafold.ebi.ac.uk/>`_,\n which contains pre-computed structures for over 200 million known proteins.\n See also:\n@@ -297,6 +415,21 @@\n - `Google Deepmind\'s article on AlphaFold <https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology>`_\n - `AlphaFold source code on GitHub <https://github.com/deepmind/alphafold>`_\n \n+ *Downstream analysis*\n+\n+ | Obtaining a protein structure prediction is the first step in many analyses.\n+ | The 3D models created by AlphaFold can be used in downstream analysis, including the following:\n+ |\n+\n+ - Inspecting protein features\n+ 3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.\n+ - Molecular docking\n+ 3D structures can be used to predict the binding affinity of different compounds.\n+ This is especially useful in screening drug candidates.\n+ - Protein-protein interactions\n+ Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.\n+ To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. 
Proprietary software includes `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available, such as `AutoDock <https://autodock.scripps.edu/>`_, `SwissDock <http://www.swissdock.ch/>`_, `DockQ <https://github.com/bjornwallner/DockQ>`_, `MM-Align <https://zhanggroup.org/MM-align/>`_ and `TM-Align <https://zhanggroup.org/TM-align/>`_. Protein-protein interactions are often inferred from AlphaFold-Multimer predictions, which provide a level of confidence in binding affinity between homomer/heteromer subunits.\n+\n ]]></help>\n <citations>\n <citation type="doi">https://doi.org/10.1038/s41586-021-03819-2</citation>\n' |
b |
diff -r c0e71cb2bd1b -r d00e15139065 gen_extra_outputs.py --- a/gen_extra_outputs.py Wed Oct 12 22:25:20 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
@@ -1,197 +0,0 @@ -"""Generate additional output files not produced by AlphaFold.""" - -import json -import pickle -import argparse -from typing import Any, Dict, List - -# Keys for accessing confidence data from JSON/pkl files -# They change depending on whether the run was monomer or multimer -CONTEXT_KEY = { - 'monomer': 'plddts', - 'multimer': 'iptm+ptm', -} - - -class Settings: - """parses then keeps track of program settings""" - def __init__(self): - self.workdir = None - self.output_confidence_scores = True - self.output_residue_scores = False - self.is_multimer = False - - def parse_settings(self) -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "workdir", - help="alphafold output directory", - type=str - ) - parser.add_argument( - "-p", - "--plddts", - help="output per-residue confidence scores (pLDDTs)", - action="store_true" - ) - parser.add_argument( - "-m", - "--multimer", - help="parse output from AlphaFold multimer", - action="store_true" - ) - args = parser.parse_args() - self.workdir = args.workdir.rstrip('/') - self.output_residue_scores = args.plddts - self.is_multimer = False - self.is_multimer = args.multimer - - -class ExecutionContext: - """uses program settings to get paths to files etc""" - def __init__(self, settings: Settings): - self.settings = settings - - def get_model_key(self, ix): - """Return json key for model index.""" - if self.settings.is_multimer: - return f'model_{ix}_multimer' - return f'model_{ix}' - - @property - def ranking_debug(self) -> str: - return f'{self.settings.workdir}/ranking_debug.json' - - @property - def model_pkls(self) -> List[str]: - ext = '.pkl' - if self.settings.is_multimer: - ext = '_multimer.pkl' - return [ - f'{self.settings.workdir}/result_model_{i}{ext}' - for i in range(1, 6) - ] - - @property - def model_conf_score_output(self) -> str: - return f'{self.settings.workdir}/model_confidence_scores.tsv' - - @property - def plddt_output(self) -> str: - return 
f'{self.settings.workdir}/plddts.tsv' - - -class FileLoader: - """loads file data for use by other classes""" - - def __init__(self, context: ExecutionContext): - self.context = context - - @property - def confidence_key(self) -> str: - """Return the correct key for confidence data.""" - if self.context.settings.is_multimer: - return CONTEXT_KEY['multimer'] - return CONTEXT_KEY['monomer'] - - def get_model_mapping(self) -> Dict[str, int]: - data = self.load_ranking_debug() - return {name: int(rank) + 1 - for (rank, name) in enumerate(data['order'])} - - def get_conf_scores(self) -> Dict[str, float]: - data = self.load_ranking_debug() - return { - name: float(f'{score:.2f}') - for name, score in data[self.confidence_key].items() - } - - def load_ranking_debug(self) -> Dict[str, Any]: - with open(self.context.ranking_debug, 'r') as fp: - return json.load(fp) - - def get_model_plddts(self) -> Dict[str, List[float]]: - plddts: Dict[str, List[float]] = {} - model_pkls = self.context.model_pkls - for i in range(len(model_pkls)): - pklfile = model_pkls[i] - with open(pklfile, 'rb') as fp: - data = pickle.load(fp) - plddts[self.context.get_model_key(i+1)] = [ - float(f'{x:.2f}') - for x in data['plddt'] - ] - return plddts - - -class OutputGenerator: - """generates the output data we are interested in creating""" - def __init__(self, loader: FileLoader): - self.loader = loader - self.context = loader.context - - def gen_conf_scores(self): - mapping = self.loader.get_model_mapping() - scores = self.loader.get_conf_scores() - ranked = list(scores.items()) - ranked.sort(key=lambda x: x[1], reverse=True) - return { - self.context.get_model_key(mapping[name]): score - for name, score in ranked - } - - def gen_residue_scores(self) -> Dict[str, List[float]]: - mapping = self.loader.get_model_mapping() - model_plddts = self.loader.get_model_plddts() - return { - self.context.get_model_key(mapping[name]): plddts - for name, plddts in model_plddts.items() - } - - -class 
OutputWriter: - """writes generated data to files""" - def __init__(self, context: ExecutionContext): - self.context = context - - def write_conf_scores(self, data: Dict[str, float]) -> None: - outfile = self.context.model_conf_score_output - with open(outfile, 'w') as fp: - for model, score in data.items(): - fp.write(f'{model}\t{score}\n') - - def write_residue_scores(self, data: Dict[str, List[float]]) -> None: - outfile = self.context.plddt_output - model_plddts = list(data.items()) - model_plddts.sort() - - with open(outfile, 'w') as fp: - for model, plddts in model_plddts: - plddt_str_list = [str(x) for x in plddts] - plddt_str = ','.join(plddt_str_list) - fp.write(f'{model}\t{plddt_str}\n') - - -def main(): - # setup - settings = Settings() - settings.parse_settings() - context = ExecutionContext(settings) - loader = FileLoader(context) - - # generate & write outputs - generator = OutputGenerator(loader) - writer = OutputWriter(context) - - # confidence scores - conf_scores = generator.gen_conf_scores() - writer.write_conf_scores(conf_scores) - - # per-residue plddts - if settings.output_residue_scores: - residue_scores = generator.gen_residue_scores() - writer.write_residue_scores(residue_scores) - - -if __name__ == '__main__': - main() |
b |
diff -r c0e71cb2bd1b -r d00e15139065 outputs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/outputs.py Tue Feb 28 01:15:42 2023 +0000 |
[ |
b'@@ -0,0 +1,245 @@\n+"""Generate additional output files not produced by AlphaFold.\r\n+\r\n+Currently this is includes:\r\n+- model confidence scores\r\n+- per-residue confidence scores (pLDDTs - optional output)\r\n+- model_*.pkl files renamed with rank order\r\n+\r\n+N.B. There have been issues with this script breaking between AlphaFold\r\n+versions due to minor changes in the output directory structure across minor\r\n+versions. It will likely need updating with future releases of AlphaFold.\r\n+\r\n+This code is more complex than you might expect due to the output files\r\n+\'moving around\' considerably, depending on run parameters. You will see that\r\n+several output paths are determined dynamically.\r\n+"""\r\n+\r\n+import argparse\r\n+import json\r\n+import os\r\n+import pickle as pk\r\n+import shutil\r\n+from pathlib import Path\r\n+from typing import List\r\n+\r\n+# Output file names\r\n+OUTPUT_DIR = \'extra\'\r\n+OUTPUTS = {\r\n+ \'model_pkl\': OUTPUT_DIR + \'/ranked_{rank}.pkl\',\r\n+ \'model_confidence_scores\': OUTPUT_DIR + \'/model_confidence_scores.tsv\',\r\n+ \'plddts\': OUTPUT_DIR + \'/plddts.tsv\',\r\n+ \'relax\': OUTPUT_DIR + \'/relax_metrics_ranked.json\',\r\n+}\r\n+\r\n+# Keys for accessing confidence data from JSON/pkl files\r\n+# They change depending on whether the run was monomer or multimer\r\n+PLDDT_KEY = {\r\n+ \'monomer\': \'plddts\',\r\n+ \'multimer\': \'iptm+ptm\',\r\n+}\r\n+\r\n+\r\n+class Settings:\r\n+ """Parse and store settings/config."""\r\n+ def __init__(self):\r\n+ self.workdir = None\r\n+ self.output_confidence_scores = True\r\n+ self.output_residue_scores = False\r\n+ self.is_multimer = False\r\n+\r\n+ def parse_settings(self) -> None:\r\n+ parser = argparse.ArgumentParser()\r\n+ parser.add_argument(\r\n+ "workdir",\r\n+ help="alphafold output directory",\r\n+ type=str\r\n+ )\r\n+ parser.add_argument(\r\n+ "-p",\r\n+ "--plddts",\r\n+ help="output per-residue confidence scores (pLDDTs)",\r\n+ action="store_true"\r\n+ 
)\r\n+ parser.add_argument(\r\n+ "-m",\r\n+ "--multimer",\r\n+ help="parse output from AlphaFold multimer",\r\n+ action="store_true"\r\n+ )\r\n+ parser.add_argument(\r\n+ "--model-pkl",\r\n+ dest="model_pkl",\r\n+ help="rename model pkl outputs with rank order",\r\n+ action="store_true"\r\n+ )\r\n+ args = parser.parse_args()\r\n+ self.workdir = Path(args.workdir.rstrip(\'/\'))\r\n+ self.output_residue_scores = args.plddts\r\n+ self.output_model_pkls = args.model_pkl\r\n+ self.is_multimer = args.multimer\r\n+ self.output_dir = self.workdir / OUTPUT_DIR\r\n+ os.makedirs(self.output_dir, exist_ok=True)\r\n+\r\n+\r\n+class ExecutionContext:\r\n+ """Collect file paths etc."""\r\n+ def __init__(self, settings: Settings):\r\n+ self.settings = settings\r\n+ if settings.is_multimer:\r\n+ self.plddt_key = PLDDT_KEY[\'multimer\']\r\n+ else:\r\n+ self.plddt_key = PLDDT_KEY[\'monomer\']\r\n+\r\n+ def get_model_key(self, ix: int) -> str:\r\n+ """Return json key for model index.\r\n+\r\n+ The key format changed between minor AlphaFold versions so this\r\n+ function determines the correct key.\r\n+ """\r\n+ with open(self.ranking_debug) as f:\r\n+ data = json.load(f)\r\n+ model_keys = list(data[self.plddt_key].keys())\r\n+ for k in model_keys:\r\n+ if k.startswith(f"model_{ix}_"):\r\n+ return k\r\n+ return KeyError(\r\n+ f\'Could not find key for index={ix} in\'\r\n+ \' ranking_debug.json\')\r\n+\r\n+ @property\r\n+ def ranking_debug(self) -> str:\r\n+ return self.settings.workdir / \'ranking_debug.json\'\r\n+\r\n+ @property\r\n+ def relax_metrics(self) -> str:\r\n+ return self.settings.workdir / \'relax_metrics.json\'\r\n+\r\n+ @property\r\n+ def relax_metrics_ranked(self) -> str:\r\n+ return self.settings.workdir / \'relax_metrics_'..b' self.context = context\r\n+ self.path = path\r\n+ self.name = os.path.basename(path).replace(\'result_\', \'\').split(\'.\')[0]\r\n+ with open(path, \'rb\') as path:\r\n+ self.data = pk.load(path)\r\n+\r\n+ @property\r\n+ def plddts(self) -> 
List[float]:\r\n+ """Return pLDDT scores for each residue."""\r\n+ return list(self.data[\'plddt\'])\r\n+\r\n+\r\n+class ResultRanking:\r\n+ """Load and manipulate data from ranking_debug.json file."""\r\n+\r\n+ def __init__(self, context: ExecutionContext):\r\n+ self.path = context.ranking_debug\r\n+ self.context = context\r\n+ with open(self.path, \'r\') as f:\r\n+ self.data = json.load(f)\r\n+\r\n+ @property\r\n+ def order(self) -> List[str]:\r\n+ """Return ordered list of model indexes."""\r\n+ return self.data[\'order\']\r\n+\r\n+ def get_plddt_for_rank(self, rank: int) -> List[float]:\r\n+ """Get pLDDT score for model instance."""\r\n+ return self.data[self.context.plddt_key][self.data[\'order\'][rank - 1]]\r\n+\r\n+ def get_rank_for_model(self, model_name: str) -> int:\r\n+ """Return 0-indexed rank for given model name.\r\n+\r\n+ Model names are expressed in result_model_*.pkl file names.\r\n+ """\r\n+ return self.data[\'order\'].index(model_name)\r\n+\r\n+\r\n+def write_confidence_scores(ranking: ResultRanking, context: ExecutionContext):\r\n+ """Write per-model confidence scores."""\r\n+ path = context.settings.workdir / OUTPUTS[\'model_confidence_scores\']\r\n+ with open(path, \'w\') as f:\r\n+ for rank in range(1, 6):\r\n+ score = ranking.get_plddt_for_rank(rank)\r\n+ f.write(f\'ranked_{rank - 1}\\t{score:.2f}\\n\')\r\n+\r\n+\r\n+def write_per_residue_scores(\r\n+ ranking: ResultRanking,\r\n+ context: ExecutionContext,\r\n+):\r\n+ """Write per-residue plddts for each model.\r\n+\r\n+ A row of plddt values is written for each model in tabular format.\r\n+ """\r\n+ model_plddts = {}\r\n+ for i, path in enumerate(context.model_pkl_paths):\r\n+ model = ResultModelPrediction(path, context)\r\n+ rank = ranking.get_rank_for_model(model.name)\r\n+ model_plddts[rank] = model.plddts\r\n+\r\n+ path = context.settings.workdir / OUTPUTS[\'plddts\']\r\n+ with open(path, \'w\') as f:\r\n+ for i in sorted(list(model_plddts.keys())):\r\n+ row = [f\'ranked_{i}\'] + [\r\n+ 
str(x) for x in model_plddts[i]\r\n+ ]\r\n+ f.write(\'\\t\'.join(row) + \'\\n\')\r\n+\r\n+\r\n+def rename_model_pkls(ranking: ResultRanking, context: ExecutionContext):\r\n+ """Rename model.pkl files so the rank order is implicit."""\r\n+ for path in context.model_pkl_paths:\r\n+ model = ResultModelPrediction(path, context)\r\n+ rank = ranking.get_rank_for_model(model.name)\r\n+ new_path = (\r\n+ context.settings.workdir\r\n+ / OUTPUTS[\'model_pkl\'].format(rank=rank)\r\n+ )\r\n+ shutil.copyfile(path, new_path)\r\n+\r\n+\r\n+def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext):\r\n+ """Replace keys in relax_metrics.json with 0-indexed rank."""\r\n+ with open(context.relax_metrics) as f:\r\n+ data = json.load(f)\r\n+ for k in list(data.keys()):\r\n+ rank = ranking.get_rank_for_model(k)\r\n+ data[f\'ranked_{rank}\'] = data.pop(k)\r\n+ new_path = context.settings.workdir / OUTPUTS[\'relax\']\r\n+ with open(new_path, \'w\') as f:\r\n+ json.dump(data, f)\r\n+\r\n+\r\n+def main():\r\n+ """Parse output files and generate additional output files."""\r\n+ settings = Settings()\r\n+ settings.parse_settings()\r\n+ context = ExecutionContext(settings)\r\n+ ranking = ResultRanking(context)\r\n+ write_confidence_scores(ranking, context)\r\n+ rekey_relax_metrics(ranking, context)\r\n+\r\n+ # Optional outputs\r\n+ if settings.output_model_pkls:\r\n+ rename_model_pkls(ranking, context)\r\n+\r\n+ if settings.output_residue_scores:\r\n+ write_per_residue_scores(ranking, context)\r\n+\r\n+\r\n+if __name__ == \'__main__\':\r\n+ main()\r\n' |
b |
diff -r c0e71cb2bd1b -r d00e15139065 validate_fasta.py --- a/validate_fasta.py Wed Oct 12 22:25:20 2022 +0000 +++ b/validate_fasta.py Tue Feb 28 01:15:42 2023 +0000 |
b |
@@ -1,8 +1,8 @@ """Validate input FASTA sequence.""" +import argparse import re import sys -import argparse from typing import List MULTIMER_MAX_SEQUENCE_COUNT = 10 |