Commit message:
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit ee77734f1800350fa2a6ef28b2b8eade304a456f-dirty

modified:
  README.rst
  alphafold.xml

added:
  scripts/outputs.py
  scripts/validate_fasta.py

removed:
  outputs.py
  validate_fasta.py
diff -r a58f7eb0df2c -r f9eb041c518c README.rst
--- a/README.rst	Fri Mar 10 02:48:07 2023 +0000
+++ b/README.rst	Mon Apr 03 01:00:42 2023 +0000
@@ -75,18 +75,20 @@
 ~~~~~~~~~~~~~~
 
 Alphafold needs reference data to run. The wrapper expects this data to
-be present at ``/data/alphafold_databases``. A custom path will be read from
-the ``ALPHAFOLD_DB`` environment variable, if set.
+be present at ``/$ALPHAFOLD_DB/TOOL_MINOR_VERSION``.
+Where ``ALPHAFOLD_DB`` is a custom path that will be read from
+the ``ALPHAFOLD_DB`` environment variable (defaulting to ``/data``).
+And TOOL_MINOR_VERSION is the alphafold version, e.g. ``2.3.1``.
 
 To download the AlphaFold reference DBs:
 
 ::
 
     # Set your AlphaFold DB path
-    ALPHAFOLD_DB=/data/alphafold_databases
+    ALPHAFOLD_DB=/data/alphafold_databases/2.3.1
 
     # Set your target AlphaFold version
-    ALPHAFOLD_VERSION=  # e.g. 2.1.2
+    ALPHAFOLD_VERSION=2.3.1
 
     # Download repo
     wget https://github.com/deepmind/alphafold/releases/tag/v${ALPHAFOLD_VERSION}.tar.gz
@@ -110,7 +112,7 @@
 
     # NOTE: this structure will change between minor AlphaFold versions
     # The tree shown below was updated for v2.3.1
 
-    data/alphafold_databases
+    data/alphafold_databases/2.3.1/
     ├── bfd
     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata
     │   ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex
@@ -176,7 +178,7 @@
 
 ::
 
-    data/alphafold_databases
+    data/alphafold_databases/2.3.1/
     ├── small_bfd
     │   └── bfd-first_non_consensus_sequences.fasta
 
@@ -193,7 +195,7 @@
 If you wish to continue hosting prior versions of the tool, you must maintain
 the reference DBs for each version. The ``ALPHAFOLD_DB`` environment variable
 must then be set respectively for each tool version in your job conf (on Galaxy
-AU this is currently `configured with TPV<https://github.com/usegalaxy-au/infrastructure/blob/master/files/galaxy/dynamic_job_rules/production/total_perspective_vortex/tools.yml#L1515-L1554>`_).
+AU this is currently `configured with TPV <https://github.com/usegalaxy-au/infrastructure/blob/master/files/galaxy/dynamic_job_rules/production/total_perspective_vortex/tools.yml#L1515-L1554>`_).
 
 To minimize redundancy between DB version, we have symlinked the database
 components that are unchanging between versions. In ``v2.1.2 -> v2.3.1`` the BFD
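The path scheme in the updated README has two parts: the ``ALPHAFOLD_DB`` environment
variable (defaulting to ``/data``) plus a subdirectory named after the tool's minor
version. A minimal shell sketch for sanity-checking such a layout follows; the version
directory and the files probed are illustrative choices, not an official checklist:

::

    # Assumed values; adjust to your deployment
    ALPHAFOLD_DB=/data
    TOOL_MINOR_VERSION=2.3

    # Probe a couple of components from the tree shown above
    for f in uniref90/uniref90.fasta pdb_mmcif/obsolete.dat; do
        if [ -e "${ALPHAFOLD_DB}/${TOOL_MINOR_VERSION}/${f}" ]; then
            echo "OK      ${f}"
        else
            echo "MISSING ${f}"
        fi
    done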
diff -r a58f7eb0df2c -r f9eb041c518c alphafold.xml
--- a/alphafold.xml	Fri Mar 10 02:48:07 2023 +0000
+++ b/alphafold.xml	Mon Apr 03 01:00:42 2023 +0000
@@ -2,7 +2,8 @@
     <description> - AI-guided 3D structural prediction of proteins</description>
     <macros>
         <token name="@TOOL_VERSION@">2.3.1</token>
-        <token name="@VERSION_SUFFIX@">1</token>
+        <token name="@TOOL_MINOR_VERSION@">2.3</token>
+        <token name="@VERSION_SUFFIX@">2</token>
         <import>macro_output.xml</import>
         <import>macro_test_output.xml</import>
     </macros>
@@ -24,8 +25,11 @@
 ## in planemo's gx_venv_n/bin/activate script. AlphaFold outputs will be copied
 ## from the test-data directory instead of running the tool.
 
-## $ALPHAFOLD_DB variable should point to the location of the AlphaFold
-## databases - defaults to /data
+## $ALPHAFOLD_DB variable should point to the location containing the versioned
+## AlphaFold databases - defaults to /data
+## that is the directory should contain a subdir / symlink named identical as
+## the value of the TOOL_MINOR_VERSION token which contains the AF reference data
+## for the corresponding version
 
 ## Read FASTA input -----------------------------------------------------------
 #if $fasta_or_text.input_mode == 'history':
@@ -34,7 +38,7 @@
     echo '$fasta_or_text.fasta_text' > input.fasta
 #end if
 
-&& python3 '$__tool_directory__/validate_fasta.py' input.fasta
+&& python3 '$__tool_directory__/scripts/validate_fasta.py' input.fasta
 --min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
 --max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
 #if $model_preset == 'multimer':
@@ -51,26 +55,26 @@
 #if os.environ.get('PLANEMO_TESTING'):
     ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
     && echo "Creating dummy outputs for model_preset=$model_preset..."
-    && bash '$__tool_directory__/mock_alphafold.sh' $model_preset
+    && bash '$__tool_directory__/scripts/mock_alphafold.sh' $model_preset
 #else:
     ## Run AlphaFold
     && python /app/alphafold/run_alphafold.py
     --fasta_paths alphafold.fasta
     --output_dir output
-    --data_dir \${ALPHAFOLD_DB:-/data}
+    --data_dir \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/
     --model_preset=$model_preset
 
     ## Set reference database paths
-    --uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
-    --mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
-    --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
-    --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
+    --uniref90_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref90/uniref90.fasta
+    --mgnify_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/mgnify/mgy_clusters_2022_05.fa
+    --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/mmcif_files
+    --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_mmcif/obsolete.dat
 
     #if $dbs == 'full':
-    --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
-    --uniref30_database_path \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
+    --bfd_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
+    --uniref30_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniref30/UniRef30_2021_03
     #else
     --db_preset=reduced_dbs
-    --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
+    --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/small_bfd/bfd-first_non_consensus_sequences.fasta
    #end if
    #if $max_template_date:
@@ -82,16 +86,16 @@
     --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True}  ## introduced in v2.1.2
 
     #if $model_preset == 'multimer':
-    --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
-    --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
+    --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb_seqres/pdb_seqres.txt
+    --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/uniprot/uniprot.fasta
     --num_multimer_predictions_per_model=1  ## introduced in v2.2.0
     #else
-    --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
+    --pdb70_database_path \${ALPHAFOLD_DB:-/data}/@TOOL_MINOR_VERSION@/pdb70/pdb70
     #end if
 #end if
 
 ## Generate additional outputs ------------------------------------------------
-&& python3 '$__tool_directory__/outputs.py' output/alphafold
+&& python3 '$__tool_directory__/scripts/outputs.py' output/alphafold
 $outputs.plddts
 $outputs.model_pkls
 $outputs.pae_csv
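With ``@TOOL_MINOR_VERSION@`` interpolated into every database flag above, hosting
several tool versions side by side reduces to keeping one directory (or symlink) per
minor version under ``$ALPHAFOLD_DB``. A sketch of the symlink approach the README
describes for components that do not change between releases; all paths here are
illustrative:

::

    # Full reference data for the current release
    mkdir -p /data/2.3

    # An older tool version can reuse the unchanged BFD database via a
    # symlink rather than a second full copy (as on Galaxy AU for
    # v2.1.2 -> v2.3.1)
    mkdir -p /data/2.1
    ln -sfn /data/2.3/bfd /data/2.1/bfd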
diff -r a58f7eb0df2c -r f9eb041c518c outputs.py
--- a/outputs.py	Fri Mar 10 02:48:07 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,323 +0,0 @@
[323 lines deleted: outputs.py was moved to scripts/outputs.py; see the diff below]
diff -r a58f7eb0df2c -r f9eb041c518c scripts/outputs.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/outputs.py	Mon Apr 03 01:00:42 2023 +0000
@@ -0,0 +1,324 @@
+"""Generate additional output files not produced by AlphaFold.
+
+Currently this is includes:
+- model confidence scores
+- per-residue confidence scores (pLDDTs - optional output)
+- model_*.pkl files renamed with rank order
+
+N.B. There have been issues with this script breaking between AlphaFold
+versions due to minor changes in the output directory structure across minor
+versions. It will likely need updating with future releases of AlphaFold.
+
+This code is more complex than you might expect due to the output files
+'moving around' considerably, depending on run parameters. You will see that
+several output paths are determined dynamically.
+"""
+
+import argparse
+import json
+import os
+import pickle as pk
+import shutil
+from pathlib import Path
+from typing import List
+
+from matplotlib import pyplot as plt
+
+# Output file paths
+OUTPUT_DIR = 'extra'
+OUTPUTS = {
+    'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl',
+    'model_pae': OUTPUT_DIR + '/pae_ranked_{rank}.csv',
+    'model_plot': OUTPUT_DIR + '/ranked_{rank}.png',
+    'model_confidence_scores': OUTPUT_DIR + '/model_confidence_scores.tsv',
+    'plddts': OUTPUT_DIR + '/plddts.tsv',
+    'relax': OUTPUT_DIR + '/relax_metrics_ranked.json',
+}
+
+# Keys for accessing confidence data from JSON/pkl files
+# They change depending on whether the run was monomer or multimer
+PLDDT_KEY = {
+    'monomer': 'plddts',
+    'multimer': 'iptm+ptm',
+}
+
+
+class Settings:
+    """Parse and store settings/config."""
+    def __init__(self):
+        self.workdir = None
+        self.output_confidence_scores = True
+        self.output_residue_scores = False
+        self.is_multimer = False
+        self.parse()
+
+    def parse(self) -> None:
+        parser = argparse.ArgumentParser()
+        parser.add_argument(
+            "workdir",
+            help="alphafold output directory",
+            type=str
+        )
+        parser.add_argument(
+            "-p",
+            "--plddts",
+            help="output per-residue confidence scores (pLDDTs)",
+            action="store_true"
+        )
+        parser.add_argument(
+            "-m",
+            "--multimer",
+            help="parse output from AlphaFold multimer",
+            action="store_true"
+        )
+        parser.add_argument(
+            "--pkl",
+            help="rename model pkl outputs with rank order",
+            action="store_true"
+        )
+        parser.add_argument(
+            "--pae",
+            help="extract PAE from pkl files to CSV format",
+            action="store_true"
+        )
+        parser.add_argument(
+            "--plot",
+            help="Plot pLDDT and PAE for each model",
+            action="store_true"
+        )
+        args = parser.parse_args()
+        self.workdir = Path(args.workdir.rstrip('/'))
+        self.output_residue_scores = args.plddts
+        self.output_model_pkls = args.pkl
+        self.output_model_plots = args.plot
+        self.output_pae = args.pae
+        self.is_multimer = args.multimer
+        self.output_dir = self.workdir / OUTPUT_DIR
+        os.makedirs(self.output_dir, exist_ok=True)
+
+
+class ExecutionContext:
+    """Collect file paths etc."""
+    def __init__(self, settings: Settings):
+        self.settings = settings
+        if settings.is_multimer:
+            self.plddt_key = PLDDT_KEY['multimer']
+        else:
+            self.plddt_key = PLDDT_KEY['monomer']
+
+    def get_model_key(self, ix: int) -> str:
+        """Return json key for model index.
+
+        The key format changed between minor AlphaFold versions so this
+        function determines the correct key.
+        """
+        with open(self.ranking_debug) as f:
+            data = json.load(f)
+        model_keys = list(data[self.plddt_key].keys())
+        for k in model_keys:
[... diff truncated ...]
+def rename_model_pkls(ranking: ResultRanking, context: ExecutionContext):
+    """Rename model.pkl files so the rank order is implicit."""
+    for path in context.model_pkl_paths:
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        new_path = (
+            context.settings.workdir
+            / OUTPUTS['model_pkl'].format(rank=rank)
+        )
+        shutil.copyfile(path, new_path)
+
+
+def extract_pae_to_csv(ranking: ResultRanking, context: ExecutionContext):
+    """Extract predicted alignment error matrix from pickle files.
+
+    Creates a CSV file for each of five ranked models.
+    """
+    for path in context.model_pkl_paths:
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        with open(path, 'rb') as f:
+            data = pk.load(f)
+        if 'predicted_aligned_error' not in data:
+            print("Skipping PAE output"
+                  f" - not found in {path}."
+                  " Running with model_preset=monomer?")
+            return
+        pae = data['predicted_aligned_error']
+        out_path = (
+            context.settings.workdir
+            / OUTPUTS['model_pae'].format(rank=rank)
+        )
+        with open(out_path, 'w') as f:
+            for row in pae:
+                f.write(','.join([str(x) for x in row]) + '\n')
+
+
+def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext):
+    """Replace keys in relax_metrics.json with 0-indexed rank."""
+    with open(context.relax_metrics) as f:
+        data = json.load(f)
+        for k in list(data.keys()):
+            rank = ranking.get_rank_for_model(k)
+            data[f'ranked_{rank}'] = data.pop(k)
+    new_path = context.settings.workdir / OUTPUTS['relax']
+    with open(new_path, 'w') as f:
+        json.dump(data, f)
+
+
+def plddt_pae_plots(ranking: ResultRanking, context: ExecutionContext):
+    """Generate a pLDDT + PAE plot for each model."""
+    for path in context.model_pkl_paths:
+        num_plots = 2
+        model = ResultModelPrediction(path, context)
+        rank = ranking.get_rank_for_model(model.name)
+        png_path = (
+            context.settings.workdir
+            / OUTPUTS['model_plot'].format(rank=rank)
+        )
+        plddts = model.data['plddt']
+        if 'predicted_aligned_error' in model.data:
+            pae = model.data['predicted_aligned_error']
+            max_pae = model.data['max_predicted_aligned_error']
+        else:
+            num_plots = 1
+
+        plt.figure(figsize=[8 * num_plots, 6])
+        plt.subplot(1, num_plots, 1)
+        plt.plot(plddts)
+        plt.title('Predicted LDDT')
+        plt.xlabel('Residue')
+        plt.ylabel('pLDDT')
+
+        if num_plots == 2:
+            plt.subplot(1, 2, 2)
+            plt.imshow(pae, vmin=0., vmax=max_pae, cmap='Greens_r')
+            plt.colorbar(fraction=0.046, pad=0.04)
+            plt.title('Predicted Aligned Error')
+            plt.xlabel('Scored residue')
+            plt.ylabel('Aligned residue')
+
+        plt.savefig(png_path)
+
+
+def main():
+    """Parse output files and generate additional output files."""
+    settings = Settings()
+    context = ExecutionContext(settings)
+    ranking = ResultRanking(context)
+    write_confidence_scores(ranking, context)
+    rekey_relax_metrics(ranking, context)
+
+    # Optional outputs
+    if settings.output_model_pkls:
+        rename_model_pkls(ranking, context)
+    if settings.output_model_plots:
+        plddt_pae_plots(ranking, context)
+    if settings.output_pae:
+        # Only created by monomer_ptm and multimer models
+        extract_pae_to_csv(ranking, context)
+    if settings.output_residue_scores:
+        write_per_residue_scores(ranking, context)
+
+
+if __name__ == '__main__':
+    main()
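The argparse interface above also makes the script usable outside Galaxy, against a
directory produced by a finished run. Illustrative invocations (the run directory name
is an assumption; the flags are the ones defined above):

::

    # Rename pkls by rank, extract PAE to CSV, plot pLDDT/PAE, and write
    # per-residue scores for a monomer run
    python3 scripts/outputs.py output/alphafold --pkl --pae --plot --plddts

    # Multimer runs are ranked under a different key ('iptm+ptm'), so
    # pass -m/--multimer
    python3 scripts/outputs.py output/alphafold -m --pae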
diff -r a58f7eb0df2c -r f9eb041c518c scripts/validate_fasta.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/validate_fasta.py	Mon Apr 03 01:00:42 2023 +0000
@@ -0,0 +1,254 @@
+"""Validate input FASTA sequence."""
+
+import argparse
+import re
+import sys
+from typing import List
+
+MULTIMER_MAX_SEQUENCE_COUNT = 10
+
+
+class Fasta:
+    def __init__(self, header_str: str, seq_str: str):
+        self.header = header_str
+        self.aa_seq = seq_str
+
+
+class FastaLoader:
+    def __init__(self, fasta_path: str):
+        """Initialize from FASTA file."""
+        self.fastas = []
+        self.load(fasta_path)
+
+    def load(self, fasta_path: str):
+        """Load bare or FASTA formatted sequence."""
+        with open(fasta_path, 'r') as f:
+            self.content = f.read()
+
+        if "__cn__" in self.content:
+            # Pasted content with escaped characters
+            self.newline = '__cn__'
+            self.read_caret = '__gt__'
+        else:
+            # Uploaded file with normal content
+            self.newline = '\n'
+            self.read_caret = '>'
+
+        self.lines = self.content.split(self.newline)
+
+        if not self.lines[0].startswith(self.read_caret):
+            # Fasta is headless, load as single sequence
+            self.update_fastas(
+                '', ''.join(self.lines)
+            )
+
+        else:
+            header = None
+            sequence = None
+            for line in self.lines:
+                if line.startswith(self.read_caret):
+                    if header:
+                        self.update_fastas(header, sequence)
+                    header = '>' + self.strip_header(line)
+                    sequence = ''
+                else:
+                    sequence += line.strip('\n ')
+            self.update_fastas(header, sequence)
+
+    def strip_header(self, line):
+        """Strip characters escaped with underscores from pasted text."""
+        return re.sub(r'\_\_.{2}\_\_', '', line).strip('>')
+
+    def update_fastas(self, header: str, sequence: str):
+        # if we have a sequence
+        if sequence:
+            # create generic header if not exists
+            if not header:
+                fasta_count = len(self.fastas)
+                header = f'>sequence_{fasta_count}'
+
+            # Create new Fasta
+            self.fastas.append(Fasta(header, sequence))
+
+
+class FastaValidator:
+    def __init__(
+            self,
+            min_length=None,
+            max_length=None,
+            multiple=False):
+        self.multiple = multiple
+        self.min_length = min_length
+        self.max_length = max_length
+        self.iupac_characters = {
+            'A', 'B', 'C', 'D', 'E', 'F', 'G',
+            'H', 'I', 'K', 'L', 'M', 'N', 'P',
+            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+            'Y', 'Z', '-'
+        }
+
+    def validate(self, fasta_list: List[Fasta]):
+        """Perform FASTA validation."""
+        self.fasta_list = fasta_list
+        self.validate_num_seqs()
+        self.validate_length()
+        self.validate_alphabet()
+        # not checking for 'X' nucleotides at the moment.
+        # alphafold can throw an error if it doesn't like it.
+        # self.validate_x()
+        return self.fasta_list
+
+    def validate_num_seqs(self) -> None:
+        """Assert that only one sequence has been provided."""
+        fasta_count = len(self.fasta_list)
+
+        if self.multiple:
+            if fasta_count < 2:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n'
+                    'Multimer mode requires multiple input sequence.'
+                    f' Only {fasta_count} sequences were detected in'
+                    ' the provided file.')
+            self.fasta_list = self.fasta_list
+
+        elif fasta_count > MULTIMER_MAX_SEQUENCE_COUNT:
+            sys.stderr.write(
+                f'WARNING: detected {fasta_count} sequences but the'
[... diff truncated ...]
+                ' no FASTA sequences detected in input file.')
+
+    def validate_length(self):
+        """Confirm whether sequence length is valid."""
+        fasta = self.fasta_list[0]
+        if self.min_length:
+            if len(fasta.aa_seq) < self.min_length:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n Sequence too short'
+                    f' ({len(fasta.aa_seq)}AA).'
+                    f' Minimum length is {self.min_length}AA.')
+        if self.max_length:
+            if len(fasta.aa_seq) > self.max_length:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n'
+                    f' Sequence too long ({len(fasta.aa_seq)}AA).'
+                    f' Maximum length is {self.max_length}AA.')
+
+    def validate_alphabet(self):
+        """Confirm whether the sequence conforms to IUPAC codes.
+
+        If not, report the offending character and its position.
+        """
+        fasta = self.fasta_list[0]
+        for i, char in enumerate(fasta.aa_seq.upper()):
+            if char not in self.iupac_characters:
+                raise ValueError(
+                    'Error encountered validating FASTA:\n Invalid amino acid'
+                    f' found at pos {i}: "{char}"')
+
+    def validate_x(self):
+        """Check for X bases."""
+        fasta = self.fasta_list[0]
+        for i, char in enumerate(fasta.aa_seq.upper()):
+            if char == 'X':
+                raise ValueError(
+                    'Error encountered validating FASTA:\n Unsupported AA code'
+                    f' "X" found at pos {i}')
+
+
+class FastaWriter:
+    def __init__(self) -> None:
+        self.line_wrap = 60
+
+    def write(self, fasta: Fasta):
+        header = fasta.header
+        seq = self.format_sequence(fasta.aa_seq)
+        sys.stdout.write(header + '\n')
+        sys.stdout.write(seq)
+
+    def format_sequence(self, aa_seq: str):
+        formatted_seq = ''
+        for i in range(0, len(aa_seq), self.line_wrap):
+            formatted_seq += aa_seq[i: i + self.line_wrap] + '\n'
+        return formatted_seq.upper()
+
+
+def main():
+    # load fasta file
+    try:
+        args = parse_args()
+        fas = FastaLoader(args.input)
+
+        # validate
+        fv = FastaValidator(
+            min_length=args.min_length,
+            max_length=args.max_length,
+            multiple=args.multimer,
+        )
+        clean_fastas = fv.validate(fas.fastas)
+
+        # write clean data
+        fw = FastaWriter()
+        for fas in clean_fastas:
+            fw.write(fas)
+
+        sys.stderr.write("Validated FASTA sequence(s):\n\n")
+        for fas in clean_fastas:
+            sys.stderr.write(fas.header + '\n')
+            sys.stderr.write(fas.aa_seq + '\n\n')
+
+    except ValueError as exc:
+        sys.stderr.write(f"{exc}\n\n")
+        raise exc
+
+    except Exception as exc:
+        sys.stderr.write(
+            "Input error: FASTA input is invalid. Please check your input.\n\n"
+        )
+        raise exc
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "input",
+        help="input fasta file",
+        type=str
+    )
+    parser.add_argument(
+        "--min_length",
+        dest='min_length',
+        help="Minimum length of input protein sequence (AA)",
+        default=None,
+        type=int,
+    )
+    parser.add_argument(
+        "--max_length",
+        dest='max_length',
+        help="Maximum length of input protein sequence (AA)",
+        default=None,
+        type=int,
+    )
+    parser.add_argument(
+        "--multimer",
+        action='store_true',
+        help="Require multiple input sequences",
+    )
+    return parser.parse_args()
+
+
+if __name__ == '__main__':
+    main()
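Since ``FastaWriter`` prints the cleaned sequences to stdout and the log goes to
stderr, callers capture the validated FASTA by redirection; the loader also accepts
Galaxy-sanitized pasted text, in which ``__cn__`` and ``__gt__`` stand in for newlines
and ``>``. A standalone usage sketch with illustrative length bounds and file names:

::

    # Validate a single monomer sequence, keeping the cleaned FASTA
    python3 scripts/validate_fasta.py input.fasta \
        --min_length 30 --max_length 2000 > alphafold.fasta

    # Multimer input: require at least two sequences
    python3 scripts/validate_fasta.py input.fasta --multimer > alphafold.fasta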
diff -r a58f7eb0df2c -r f9eb041c518c validate_fasta.py
--- a/validate_fasta.py	Fri Mar 10 02:48:07 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,254 +0,0 @@
[254 lines deleted: validate_fasta.py was moved to scripts/validate_fasta.py; see the diff above]