# HG changeset patch
# User galaxy-australia
# Date 1678416487 0
# Node ID a58f7eb0df2cd0c4df5ddb753fec518da2f9cb86
# Parent d00e15139065ff39933cfbaddf29c7d7735bdaf9
planemo upload for repository https://github.com/usegalaxy-au/tools-au commit fd45a857a71358e7e5375dcfb5043cdc8560c5a5
diff -r d00e15139065 -r a58f7eb0df2c alphafold.xml
--- a/alphafold.xml Tue Feb 28 01:15:42 2023 +0000
+++ b/alphafold.xml Fri Mar 10 02:48:07 2023 +0000
@@ -2,7 +2,9 @@
- AI-guided 3D structural prediction of proteins
2.3.1
- 0
+ 1
+ macro_output.xml
+ macro_test_output.xml
topic_0082
@@ -14,17 +16,20 @@
alphafold_2
- neoformit/alphafold:v2.3.1_1
+ neoformit/alphafold:v2.3.1_2
input.fasta
#end if
@@ -32,55 +37,66 @@
&& python3 '$__tool_directory__/validate_fasta.py' input.fasta
--min_length \${ALPHAFOLD_AA_LENGTH_MIN:-0}
--max_length \${ALPHAFOLD_AA_LENGTH_MAX:-0}
-#if $multimer:
+#if $model_preset == 'multimer':
--multimer
#end if
> alphafold.fasta
-## Env vars -------------------------------
+## Env vars -------------------------------------------------------------------
&& export TF_FORCE_UNIFIED_MEMORY=1
&& export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0
&& export TODAY=`date +"%Y-%m-%d"`
-## Run alphafold -------------------------
-&& python /app/alphafold/run_alphafold.py
- --fasta_paths alphafold.fasta
- --output_dir output
- --data_dir \${ALPHAFOLD_DB:-/data}
-
- ## Set reference database paths
- --uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
- --mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
- --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
- --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
- #if $dbs == 'full':
- --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
- --uniref30_database_path \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
- #else
- --db_preset=reduced_dbs
- --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
- #end if
+## Run AlphaFold -------------------------------------------------------------
+#if os.environ.get('PLANEMO_TESTING'):
+ ## Run in testing mode (mocks a successful AlphaFold run by copying outputs)
+ && echo "Creating dummy outputs for model_preset=$model_preset..."
+ && bash '$__tool_directory__/mock_alphafold.sh' $model_preset
+#else:
+ ## Run AlphaFold
+ && python /app/alphafold/run_alphafold.py
+ --fasta_paths alphafold.fasta
+ --output_dir output
+ --data_dir \${ALPHAFOLD_DB:-/data}
+ --model_preset=$model_preset
- #if $max_template_date:
- --max_template_date=$max_template_date
- #else
- --max_template_date=\$TODAY
- #end if
+ ## Set reference database paths
+ --uniref90_database_path \${ALPHAFOLD_DB:-/data}/uniref90/uniref90.fasta
+ --mgnify_database_path \${ALPHAFOLD_DB:-/data}/mgnify/mgy_clusters_2022_05.fa
+ --template_mmcif_dir \${ALPHAFOLD_DB:-/data}/pdb_mmcif/mmcif_files
+ --obsolete_pdbs_path \${ALPHAFOLD_DB:-/data}/pdb_mmcif/obsolete.dat
+ #if $dbs == 'full':
+ --bfd_database_path \${ALPHAFOLD_DB:-/data}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt
+ --uniref30_database_path \${ALPHAFOLD_DB:-/data}/uniref30/UniRef30_2021_03
+ #else
+ --db_preset=reduced_dbs
+ --small_bfd_database_path \${ALPHAFOLD_DB:-/data}/small_bfd/bfd-first_non_consensus_sequences.fasta
+ #end if
- --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True} ## introduced in v2.1.2
+ #if $max_template_date:
+ --max_template_date=$max_template_date
+ #else
+ --max_template_date=\$TODAY
+ #end if
- #if $multimer:
- --model_preset=multimer
- --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
- --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
- --num_multimer_predictions_per_model=1 ## introduced in v2.2.0
- #else
- --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
- #end if
+ --use_gpu_relax=\${ALPHAFOLD_USE_GPU:-True} ## introduced in v2.1.2
-## Generate additional outputs ------------
-&& python3 '$__tool_directory__/outputs.py' output/alphafold $outputs.plddts
-#if $multimer:
+ #if $model_preset == 'multimer':
+ --pdb_seqres_database_path=\${ALPHAFOLD_DB:-/data}/pdb_seqres/pdb_seqres.txt
+ --uniprot_database_path=\${ALPHAFOLD_DB:-/data}/uniprot/uniprot.fasta
+ --num_multimer_predictions_per_model=1 ## introduced in v2.2.0
+ #else
+ --pdb70_database_path \${ALPHAFOLD_DB:-/data}/pdb70/pdb70
+ #end if
+#end if
+
+## Generate additional outputs ------------------------------------------------
+&& python3 '$__tool_directory__/outputs.py' output/alphafold
+$outputs.plddts
+$outputs.model_pkls
+$outputs.pae_csv
+$outputs.plots
+#if $model_preset == 'multimer':
--multimer
#end if
@@ -137,15 +153,31 @@
+ name="model_preset"
+ type="select"
+ label="Model preset"
+ help="Select which prediction model to run. The monomer model is the most accurate for single protein prediction. The multimer model allows prediction of protein complexes."
+ >
+
+
+
+
+
+
-
-
-
-
-
+
-
-
- outputs['confidence_scores']
-
-
-
- outputs['plddts']
-
-
-
- outputs['model_pkls']
-
-
- outputs['model_pkls']
-
-
- outputs['model_pkls']
-
-
- outputs['model_pkls']
-
-
- outputs['model_pkls']
-
-
- outputs['relax_json']
-
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
`_.
- | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets. If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU `_, or open an issue on `our GitHub `_.
+ | This means that it runs with Amber relaxation enabled, with relaxed PDB models collected as output datasets (ranked\_*.pdb files). If there are additonal parameters that you would like to interact with, please `send a support request to Galaxy AU `_, or open an issue on `our GitHub `_.
|
|
diff -r d00e15139065 -r a58f7eb0df2c macro_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macro_output.xml Fri Mar 10 02:48:07 2023 +0000
@@ -0,0 +1,176 @@
+
+
+
+
+
+
+
+
+
+
+
+ outputs['pae_csv']
+ model_preset != "monomer"
+
+
+ outputs['pae_csv']
+ model_preset != "monomer"
+
+
+ outputs['pae_csv']
+ model_preset != "monomer"
+
+
+ outputs['pae_csv']
+ model_preset != "monomer"
+
+
+ outputs['pae_csv']
+ model_preset != "monomer"
+
+
+
+
+
+ outputs['model_pkls']
+
+
+ outputs['model_pkls']
+
+
+ outputs['model_pkls']
+
+
+ outputs['model_pkls']
+
+
+ outputs['model_pkls']
+
+
+
+
+
+ outputs['plots']
+
+
+ outputs['plots']
+
+
+ outputs['plots']
+
+
+ outputs['plots']
+
+
+ outputs['plots']
+
+
+
+
+
+ outputs['confidence_scores']
+
+
+
+
+
+ outputs['plddts']
+
+
+
+
+
+ outputs['relax_json']
+
+
+
diff -r d00e15139065 -r a58f7eb0df2c macro_test_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macro_test_output.xml Fri Mar 10 02:48:07 2023 +0000
@@ -0,0 +1,199 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r d00e15139065 -r a58f7eb0df2c outputs.py
--- a/outputs.py Tue Feb 28 01:15:42 2023 +0000
+++ b/outputs.py Fri Mar 10 02:48:07 2023 +0000
@@ -19,13 +19,16 @@
import os
import pickle as pk
import shutil
+from matplotlib import pyplot as plt
from pathlib import Path
from typing import List
-# Output file names
+# Output file paths
OUTPUT_DIR = 'extra'
OUTPUTS = {
'model_pkl': OUTPUT_DIR + '/ranked_{rank}.pkl',
+ 'model_pae': OUTPUT_DIR + '/pae_ranked_{rank}.csv',
+ 'model_plot': OUTPUT_DIR + '/ranked_{rank}.png',
'model_confidence_scores': OUTPUT_DIR + '/model_confidence_scores.tsv',
'plddts': OUTPUT_DIR + '/plddts.tsv',
'relax': OUTPUT_DIR + '/relax_metrics_ranked.json',
@@ -46,8 +49,9 @@
self.output_confidence_scores = True
self.output_residue_scores = False
self.is_multimer = False
+ self.parse()
- def parse_settings(self) -> None:
+ def parse(self) -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"workdir",
@@ -67,15 +71,26 @@
action="store_true"
)
parser.add_argument(
- "--model-pkl",
- dest="model_pkl",
+ "--pkl",
help="rename model pkl outputs with rank order",
action="store_true"
)
+ parser.add_argument(
+ "--pae",
+ help="extract PAE from pkl files to CSV format",
+ action="store_true"
+ )
+ parser.add_argument(
+ "--plot",
+ help="Plot pLDDT and PAE for each model",
+ action="store_true"
+ )
args = parser.parse_args()
self.workdir = Path(args.workdir.rstrip('/'))
self.output_residue_scores = args.plddts
- self.output_model_pkls = args.model_pkl
+ self.output_model_pkls = args.pkl
+ self.output_model_plots = args.plot
+ self.output_pae = args.pae
self.is_multimer = args.multimer
self.output_dir = self.workdir / OUTPUT_DIR
os.makedirs(self.output_dir, exist_ok=True)
@@ -212,6 +227,31 @@
shutil.copyfile(path, new_path)
+def extract_pae_to_csv(ranking: ResultRanking, context: ExecutionContext):
+ """Extract predicted alignment error matrix from pickle files.
+
+ Creates a CSV file for each of five ranked models.
+ """
+ for path in context.model_pkl_paths:
+ model = ResultModelPrediction(path, context)
+ rank = ranking.get_rank_for_model(model.name)
+ with open(path, 'rb') as f:
+ data = pk.load(f)
+ if 'predicted_aligned_error' not in data:
+ print("Skipping PAE output"
+ f" - not found in {path}."
+ " Running with model_preset=monomer?")
+ return
+ pae = data['predicted_aligned_error']
+ out_path = (
+ context.settings.workdir
+ / OUTPUTS['model_pae'].format(rank=rank)
+ )
+ with open(out_path, 'w') as f:
+ for row in pae:
+ f.write(','.join([str(x) for x in row]) + '\n')
+
+
def rekey_relax_metrics(ranking: ResultRanking, context: ExecutionContext):
"""Replace keys in relax_metrics.json with 0-indexed rank."""
with open(context.relax_metrics) as f:
@@ -224,10 +264,44 @@
json.dump(data, f)
+def plddt_pae_plots(ranking: ResultRanking, context: ExecutionContext):
+ """Generate a pLDDT + PAE plot for each model."""
+ for path in context.model_pkl_paths:
+ num_plots = 2
+ model = ResultModelPrediction(path, context)
+ rank = ranking.get_rank_for_model(model.name)
+ png_path = (
+ context.settings.workdir
+ / OUTPUTS['model_plot'].format(rank=rank)
+ )
+ plddts = model.data['plddt']
+ if 'predicted_aligned_error' in model.data:
+ pae = model.data['predicted_aligned_error']
+ max_pae = model.data['max_predicted_aligned_error']
+ else:
+ num_plots = 1
+
+ plt.figure(figsize=[8 * num_plots, 6])
+ plt.subplot(1, num_plots, 1)
+ plt.plot(plddts)
+ plt.title('Predicted LDDT')
+ plt.xlabel('Residue')
+ plt.ylabel('pLDDT')
+
+ if num_plots == 2:
+ plt.subplot(1, 2, 2)
+ plt.imshow(pae, vmin=0., vmax=max_pae, cmap='Greens_r')
+ plt.colorbar(fraction=0.046, pad=0.04)
+ plt.title('Predicted Aligned Error')
+ plt.xlabel('Scored residue')
+ plt.ylabel('Aligned residue')
+
+ plt.savefig(png_path)
+
+
def main():
"""Parse output files and generate additional output files."""
settings = Settings()
- settings.parse_settings()
context = ExecutionContext(settings)
ranking = ResultRanking(context)
write_confidence_scores(ranking, context)
@@ -236,7 +310,11 @@
# Optional outputs
if settings.output_model_pkls:
rename_model_pkls(ranking, context)
-
+ if settings.output_model_plots:
+ plddt_pae_plots(ranking, context)
+ if settings.output_pae:
+ # Only created by monomer_ptm and multimer models
+ extract_pae_to_csv(ranking, context)
if settings.output_residue_scores:
write_per_residue_scores(ranking, context)
diff -r d00e15139065 -r a58f7eb0df2c validate_fasta.py
--- a/validate_fasta.py Tue Feb 28 01:15:42 2023 +0000
+++ b/validate_fasta.py Fri Mar 10 02:48:07 2023 +0000
@@ -205,6 +205,11 @@
for fas in clean_fastas:
fw.write(fas)
+ sys.stderr.write("Validated FASTA sequence(s):\n\n")
+ for fas in clean_fastas:
+ sys.stderr.write(fas.header + '\n')
+ sys.stderr.write(fas.aa_seq + '\n\n')
+
except ValueError as exc:
sys.stderr.write(f"{exc}\n\n")
raise exc