# HG changeset patch # User galaxy-australia # Date 1730168136 0 # Node ID e7f1b552a6955b76a9614fabb4bbadc0d1bbdfed # Parent 6ab1a261520a4050b09da0c45f82b5a0c967b116 planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 628c9fdcb77489063145a2307b6bb6a450416dd6-dirty diff -r 6ab1a261520a -r e7f1b552a695 alphafold.xml --- a/alphafold.xml Sun Jul 28 20:09:55 2024 +0000 +++ b/alphafold.xml Tue Oct 29 02:15:36 2024 +0000 @@ -3,7 +3,7 @@ 2.3.2 2.3 - 0 + 1 macro_output.xml macro_test_output.xml @@ -118,9 +118,7 @@ $outputs.model_pkls $outputs.pae_csv $outputs.plots -#if $model_preset.selection == 'multimer': ---multimer -#end if +$outputs.plot_msa ## HTML output && mkdir -p '${ html.files_path }' @@ -135,7 +133,7 @@ ]]> - + @@ -236,18 +234,27 @@ help="A two-panel plot showing pLDDT against residue position (left) and PAE (paired-alignment error) as a heatmap image with residue numbers running along vertical and horizontal axes and color at each pixel indicating PAE value for the corresponding pair of residues. (right). PAE heatmap is only produced with monomer_ptm and multimer model presets." /> + @@ -291,6 +298,7 @@ + @@ -464,6 +472,17 @@ | | + *MSA sequence coverage plot (optional)* + + | A heatmap in PNG format showing: + | a) Per-position sequence identity to query as a heatmap + | b) Per-position sequence coverage as a line plot + | + | This plot can help you understand if regions of low confidence are due to poor sequence coverage, rather than + | limitations of the model or intrinsically unstable regions. + | + | + *Model predicted-alignment error matrix (pae_ranked_n.csv)* | Per-model predicted-alignment error (PAE) matrix - only available with the ``monomer_ptm`` and ``multimer`` model presets. diff -r 6ab1a261520a -r e7f1b552a695 macro_output.xml --- a/macro_output.xml Sun Jul 28 20:09:55 2024 +0000 +++ b/macro_output.xml Tue Oct 29 02:15:36 2024 +0000 @@ -172,6 +172,17 @@ + + + outputs['plot_msa'] + + + bool: + """Check if the run was multimer or monomer.""" + with open(self.workdir / 'relax_metrics.json') as f: + if '_multimer_' in f.read(): + return PRESETS.multimer + if '_ptm_' in f.read(): + return PRESETS.monomer_ptm + return PRESETS.monomer + class ExecutionContext: """Collect file paths etc.""" def __init__(self, settings: Settings): self.settings = settings - if settings.is_multimer: - self.plddt_key = PLDDT_KEY['multimer'] + if settings.model_preset == PRESETS.multimer: + self.plddt_key = PLDDT_KEY.multimer else: - self.plddt_key = PLDDT_KEY['monomer'] + self.plddt_key = PLDDT_KEY.monomer def get_model_key(self, ix: int) -> str: """Return json key for model index. @@ -192,11 +210,30 @@ def write_confidence_scores(ranking: ResultRanking, context: ExecutionContext): """Write per-model confidence scores.""" - path = context.settings.workdir / OUTPUTS['model_confidence_scores'] - with open(path, 'w') as f: - for rank in range(1, len(context.model_pkl_paths) + 1): - score = ranking.get_plddt_for_rank(rank) - f.write(f'ranked_{rank - 1}\t{score:.2f}\n') + outfile = context.settings.workdir / OUTPUTS['model_confidence_scores'] + scores: Dict[str, list] = {} + header = ['model', context.plddt_key] + + for i, path in enumerate(context.model_pkl_paths): + rank = int(path.name.split('model_')[-1][0]) + scores_ls = [ranking.get_plddt_for_rank(rank)] + with open(path, 'rb') as f: + data = pk.load(f) + if 'ptm' in data: + scores_ls.append(data['ptm']) + if i == 0: + header += ['ptm'] + if 'iptm' in data: + scores_ls.append(data['iptm']) + if i == 0: + header += ['iptm'] + scores[rank] = scores_ls + + with open(outfile, 'w') as f: + f.write('\t'.join(header) + '\n') + for rank, score_ls in scores.items(): + row = [f"ranked_{rank - 1}"] + [str(x) for x in score_ls] + f.write('\t'.join(row) + '\n') def write_per_residue_scores( @@ -304,6 +341,40 @@ plt.ylabel('Aligned residue') plt.savefig(png_path) + plt.close() + + +def plot_msa(wdir: Path, dpi: int = 150): + """Plot MSA as a heatmap.""" + with open(wdir / 'features.pkl', 'rb') as f: + features = pk.load(f) + + msa = features.get('msa') + if msa is None: + print("Could not plot MSA coverage - 'msa' key not found in" + " features.pkl") + return + seqid = (np.array(msa[0] == msa).mean(-1)) + seqid_sort = seqid.argsort() + non_gaps = (msa != 21).astype(float) + non_gaps[non_gaps == 0] = np.nan + final = non_gaps[seqid_sort] * seqid[seqid_sort, None] + + plt.figure(figsize=(6, 4)) + # plt.subplot(111) + plt.title("Sequence coverage") + plt.imshow(final, + interpolation='nearest', aspect='auto', + cmap="rainbow_r", vmin=0, vmax=1, origin='lower') + plt.plot((msa != 21).sum(0), color='black') + plt.xlim(-0.5, msa.shape[1] - 0.5) + plt.ylim(-0.5, msa.shape[0] - 0.5) + plt.colorbar(label="Sequence identity to query", ) + plt.xlabel("Positions") + plt.ylabel("Sequences") + plt.tight_layout() + plt.savefig(wdir / OUTPUTS['msa'], dpi=dpi) + plt.close() def template_html(context: ExecutionContext): @@ -341,6 +412,8 @@ extract_pae_to_csv(ranking, context) if settings.output_residue_scores: write_per_residue_scores(ranking, context) + if settings.plot_msa: + plot_msa(context.settings.workdir) if __name__ == '__main__':