Next changeset 1:6c92e000d684 (2022-03-01) |
Commit message:
"planemo upload for repository https://github.com/usegalaxy-au/galaxy-local-tools commit 7b79778448363aa8c9b14604337e81009e461bd2-dirty" |
added:
README.md alphafold.html alphafold.xml gen_extra_outputs.py scripts/download_all_data.sh scripts/download_alphafold_params.sh scripts/download_bfd.sh scripts/download_mgnify.sh scripts/download_pdb70.sh scripts/download_pdb_mmcif.sh scripts/download_pdb_seqres.sh scripts/download_small_bfd.sh scripts/download_uniclust30.sh scripts/download_uniprot.sh scripts/download_uniref90.sh static/img/alphafold-visualization.png static/img/alphafold_runtime_graph.png test-data/test1.fasta validate_fasta.py |
b |
diff -r 000000000000 -r 7ae9d78b06f5 README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Fri Jan 28 04:56:29 2022 +0000 |
[ |
@@ -0,0 +1,157 @@ + +# Alphafold compute setup + +## Overview + +Alphafold requires a customised compute environment to run. The machine needs a GPU, and access to a 2.2 Tb reference data store. + +This document is designed to provide details on the compute environment required for Alphafold operation, and the Galaxy job destination settings to run the wrapper. + +For full details on Alphafold requirements, see https://github.com/deepmind/alphafold. + +<br> + +### HARDWARE + +The machine is recommended to have the following specs: +- 12 cores +- 80 Gb RAM +- 2.5 Tb storage +- A fast Nvidia GPU. + +As a minimum, the Nvidia GPU must have 8Gb RAM. It also requires ***unified memory*** to be switched on. <br> +Unified memory is usually enabled by default, but some HPC systems will turn it off so the GPU can be shared between multiple jobs concurrently. + +<br> + +### ENVIRONMENT + +This wrapper runs Alphafold as a singularity container. The following software are needed: + +- [Singularity](https://sylabs.io/guides/3.0/user-guide/installation.html) +- [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) + +As Alphafold uses an Nvidia GPU, the NVIDIA Container Toolkit is needed. This makes the GPU available inside the running singularity container. + +To check that everything has been set up correctly, run the following + +``` +singularity run --nv docker://nvidia/cuda:11.0-base nvidia-smi +``` + +If you can see something similar to this output (details depend on your GPU), it has been set up correctly. + +``` ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 470.57.02 Driver Version: 470.57.02 CUDA Version: 11.4 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|===============================+======================+======================| +| 0 Tesla T4 Off | 00000000:00:05.0 Off | 0 | +| N/A 49C P0 28W / 70W | 0MiB / 15109MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| No running processes found | ++-----------------------------------------------------------------------------+ +``` + + +<br> + +### REFERENCE DATA + +Alphafold needs reference data to run. The wrapper expects this data to be present at `/data/alphafold_databases`. <br> +To download, run the following shell script command in the tool directory. + +``` +# make folders if needed +mkdir /data /data/alphafold_databases + +# download ref data +bash scripts/download_all_data.sh /data/alphafold_databases +``` + +This will install the reference data to `/data/alphafold_databases`. 
To check this has worked, ensure the final folder structure is as follows: + +``` +data/alphafold_databases +├── bfd +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_a3m.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffdata +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_cs219.ffindex +│ ├── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffdata +│ └── bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt_hhm.ffindex +├── mgnify +│ └── mgy_clusters_2018_12.fa +├── params +│ ├── LICENSE +│ ├── params_model_1.npz +│ ├── params_model_1_ptm.npz +│ ├── params_model_2.npz +│ ├── params_model_2_ptm.npz +│ ├── params_model_3.npz +│ ├── params_model_3_ptm.npz +│ ├── params_model_4.npz +│ ├── params_model_4_ptm.npz +│ ├── params_model_5.npz +│ └── params_model_5_ptm.npz +├── pdb70 +│ ├── md5sum +│ ├── pdb70_a3m.ffdata +│ ├── pdb70_a3m.ffindex +│ ├── pdb70_clu.tsv +│ ├── pdb70_cs219.ffdata +│ ├── pdb70_cs219.ffindex +│ ├── pdb70_hhm.ffdata +│ ├── pdb70_hhm.ffindex +│ └── pdb_filter.dat +├── pdb_mmcif +│ ├── mmcif_files +│ └── obsolete.dat +├── uniclust30 +│ └── uniclust30_2018_08 +└── uniref90 + └── uniref90.fasta +``` + + +<br> + +### JOB DESTINATION + +Alphafold needs a custom singularity job destination to run. +The destination needs to be configured for singularity, and some +extra singularity params need to be set as seen below. + +Specify the job runner. For example, a local runner + +``` +<plugin id="alphafold_runner" type="runner" load="galaxy.jobs.runners.local:LocalJobRunner"/> +``` + +Customise the job destination with required singularity settings. <br> +The settings below are mandatory, but you may include other settings as needed. 
+ +``` +<destination id="alphafold" runner="alphafold_runner"> + <param id="dependency_resolution">'none'</param> + <param id="singularity_enabled">true</param> + <param id="singularity_run_extra_arguments">--nv</param> + <param id="singularity_volumes">"$job_directory:ro,$tool_directory:ro,$job_directory/outputs:rw,$working_directory:rw,/data/alphafold_databases:/data:ro"</param> +</destination> +``` + +<br> + +### Closing + +If you are experiencing technical issues, feel free to write to help@genome.edu.au. We may be able to provide comment on setting up Alphafold on your compute environment. |
b |
diff -r 000000000000 -r 7ae9d78b06f5 alphafold.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alphafold.html Fri Jan 28 04:56:29 2022 +0000 |
[ |
b'@@ -0,0 +1,656 @@\n+<!DOCTYPE html>\n+<html lang="en" dir="ltr">\n+\n+ <head>\n+ <meta charset="utf-8">\n+ <meta http-equiv="X-UA-Compatible" content="IE=edge">\n+ <meta name="viewport" content="width=device-width, initial-scale=1">\n+\n+ <title> Alphafold structure prediction </title>\n+\n+ <link rel="preconnect" href="https://fonts.googleapis.com">\n+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>\n+ <link href="https://fonts.googleapis.com/css2?family=Ubuntu:wght@300;400;500;700&display=swap" rel="stylesheet">\n+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" integrity="sha384-BVYiiSIFeK1dGmJRAkycuHAHRg32OmUcww7on3RYdg4Va+PmSTsz/K68vbdEjh4u" crossorigin="anonymous">\n+ <script src="https://cdnjs.cloudflare.com/ajax/libs/chroma-js/2.1.0/chroma.min.js" integrity="sha512-yocoLferfPbcwpCMr8v/B0AB4SWpJlouBwgE0D3ZHaiP1nuu5djZclFEIj9znuqghaZ3tdCMRrreLoM8km+jIQ==" crossorigin="anonymous"></script>\n+\n+ <style type="text/css">\n+ * {\n+ margin: 0;\n+ padding: 0;\n+ }\n+ html, body {\n+ width: 100%;\n+ font-size: 1rem;\n+ }\n+ body {\n+ font-family: \'Ubuntu\', sans-serif;\n+ }\n+ canvas {\n+ background-color: white;\n+ }\n+ h1, h2, h3, h4, h5, h6 {\n+ color: dodgerblue;\n+ text-align: center;\n+ font-weight: lighter;\n+ }\n+ h1 {\n+ margin: 2rem;\n+ font-size: 3rem;\n+ }\n+ h2 {\n+ font-size: 2rem;\n+ margin-top: 1rem;\n+ margin-bottom: .5rem;\n+ }\n+ button.btn {\n+ color: #ccc;\n+ margin: 1rem;\n+ padding: .5rem;\n+ font-size: 1rem;\n+ min-width: 4rem;\n+ border: none;\n+ border-radius: .5rem;\n+ background-color: grey;\n+ transition-duration: 0.25s;\n+ cursor: pointer;\n+ }\n+ button.btn.selected {\n+ color: #eee;\n+ background-color: dodgerblue;\n+ }\n+ button.btn.green {\n+ color: #eee;\n+ background-color: #10941f;\n+ }\n+ button.btn:focus {\n+ outline: none;\n+ color: inherit;\n+ }\n+ button.btn:hover {\n+ color: white;\n+ box-shadow: 0 0 10px dodgerblue;\n+ }\n+ button.btn.green:hover 
{\n+ color: white;\n+ box-shadow: 0 0 10px limegreen;\n+ }\n+ .main {\n+ min-height: 90vh;\n+ position: relative;\n+ }\n+ .flex {\n+ display: flex;\n+ justify-content: center;\n+ align-items: center;\n+ padding: 1rem;\n+ }\n+ .col {\n+ flex-direction: column;\n+ flex-grow: 0;\n+ }\n+ .controls {\n+ padding-bottom: 10vh;\n+ }\n+ .box {\n+ padding: .5rem 1rem;\n+ margin: .5rem auto;\n+ width: fit-content;\n+ border-radius: 1rem;\n+ }\n+ .mono {\n+ font-family: monospace;\n+ color: #555;\n+ background-color: #ddd;\n+ padding: .25rem;\n+ border-radius: .25rem;\n+ }\n+ .space-1 {\n+ line-height: 1.2;\n+ }\n+ .space-2 {\n+ line-height: 1.5;\n+ }\n+ .relative {\n+ position: relative;\n+ }\n+ .legend {\n+ max-width: 350px;\n+ }\n+ .legend .scale {\n+ display: flex;\n+ flex-direction: column;\n+ align-items: center;\n+ }\n+ .legend .color {\n+ width: 150px;\n+ height: 30px;\n+ justify-content: space-between;\n+ background: linear-gradient(\n+ 90deg,\n+ rgba(255,55,0,1) 0%,\n+ rgba(216,224,6,1) 33%,\n+ rgba(34,213,238,1) 66%,\n+ rgba(3,30,148,1) 100%\n+ );\n+ }\n+ .legend .ticks {\n+ margin-top: -1rem;\n+ width: 180px;\n+ justify-content: space-between;\n+ }\n+ #ngl-root-parent {\n+ width: 40vw;\n+ height: 30vw;\n+ margin: auto;\n+ position: relative;\n+ }\n+ #ngl-root {\n+ width: '..b' if (reps.length) {\n+ state.representations = {};\n+ } else {\n+ reps = [DEFAULT_REPRESENTATION];\n+ }\n+\n+ // Load PDB entry\n+ return stage.loadFile(uri(state.model)).then( (o) => {\n+ state.modelObject = o;\n+ reps.forEach( (r) => addModelRepresentation(r) );\n+ stage.setSpin(state.spin);\n+ o.autoView();\n+ setLoading(0);\n+ })\n+ }\n+\n+ // Representations ---------------------------------------------------------\n+\n+ const toggleModelRepresentation = (rep) => {\n+ rep in state.representations ?\n+ removeModelRepresentation(rep)\n+ : addModelRepresentation(rep)\n+ }\n+\n+ const addModelRepresentation = (rep) => {\n+ state.representations[rep] =\n+ 
state.modelObject.addRepresentation(rep, {colorScheme: COLORSCHEME});\n+ updateButtons();\n+ }\n+\n+ const removeModelRepresentation = (rep) => {\n+ o = state.representations[rep];\n+ state.modelObject.removeRepresentation(o);\n+ delete state.representations[rep];\n+ updateButtons();\n+ }\n+\n+ const clearModelRepresentations = () => {\n+ state.modelObject && state.modelObject.removeAllRepresentations();\n+ state.representations = {};\n+ }\n+\n+ // Actions -----------------------------------------------------------------\n+\n+ const toggleDark = () => {\n+ state.darkMode = !state.darkMode;\n+ stage.setParameters({\n+ backgroundColor: state.darkMode ? \'black\' : \'white\',\n+ });\n+ const btn = document.querySelector(\'#btn-toggle-dark\');\n+ btn && btn.classList.toggle(\'selected\');\n+ }\n+\n+ const setLoading = (state) => {\n+ document.getElementById(\'ngl-loading\')\n+ .style.display = state ? \'flex\' : \'none\';\n+ state.loading = state;\n+ }\n+\n+ const toggleSpin = () => {\n+ stage.toggleSpin();\n+ const btn = document.querySelector(\'#btn-toggle-spin\');\n+ btn && btn.classList.toggle(\'selected\');\n+ state.spin = !state.spin;\n+ }\n+\n+ const downloadPng = () => {\n+ const params = {\n+ factor: 3,\n+ antialias: true,\n+ }\n+ stage.makeImage(params).then( (blob) => {\n+ const name = MODELS[state.model].replace(\'.pdb\', \'.png\');\n+ const url = URL.createObjectURL(blob);\n+ makeDownload(url, name);\n+ })\n+ }\n+\n+ const downloadPdb = () => {\n+ const url = uri(state.model);\n+ const name = `alphafold_${MODELS[state.model]}`;\n+ makeDownload(url, name);\n+ }\n+\n+ const makeDownload = (url, name) => {\n+ // Will not work with cross-origin urls (i.e. 
during development)\n+ console.log(`Creating file download for ${name}, href ${url}`);\n+ const saveLink = document.createElement(\'a\');\n+ saveLink.href = url;\n+ saveLink.download = name;\n+ document.body.appendChild(saveLink);\n+ saveLink.dispatchEvent(\n+ new MouseEvent(\'click\', {\n+ bubbles: true,\n+ cancelable: true,\n+ view: window\n+ })\n+ );\n+ document.body.removeChild(saveLink);\n+ }\n+\n+ const updateButtons = () => {\n+ MODELS.forEach( (name, i) => {\n+ const id = `#btn-${name.replace(\'.pdb\', \'\')}`;\n+ const btn = document.querySelector(id);\n+ if (!btn) return\n+ i == state.model ?\n+ btn.classList.add(\'selected\')\n+ : btn.classList.remove(\'selected\');\n+ })\n+\n+ REPRESENTATIONS.forEach( (name) => {\n+ const id = `#btn-${name}`.replace(\'+\', \'-\');\n+ const btn = document.querySelector(id);\n+ if (!btn) return\n+ if (name in state.representations) {\n+ btn.classList.add(\'selected\')\n+ } else {\n+ btn.classList.remove(\'selected\');\n+ }\n+ });\n+\n+ // Show "Nothing to display" if no representations are selected\n+ document.querySelector(\'#ngl-nothing\').style.display =\n+ Object.keys(state.representations).length ?\n+ \'none\'\n+ : \'block\';\n+ }\n+\n+ </script>\n+\n+</html>\n' |
b |
diff -r 000000000000 -r 7ae9d78b06f5 alphafold.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alphafold.xml Fri Jan 28 04:56:29 2022 +0000 |
[ |
b'@@ -0,0 +1,246 @@\n+<tool id="alphafold" name="alphafold" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">\n+ <description>Alphafold v2.0: AI-guided 3D structure prediction of proteins</description>\n+ <macros>\n+ <token name="@TOOL_VERSION@">2.0.0</token>\n+ <token name="@VERSION_SUFFIX@">0</token>\n+ </macros>\n+ <edam_topics>\n+ <edam_topic>topic_0082</edam_topic>\n+ </edam_topics>\n+ <edam_operations>\n+ <edam_operation>operation_0474</edam_operation>\n+ </edam_operations>\n+ <requirements>\n+ <container type="docker">neoformit/alphafold-galaxy@sha256:6adf7f07062b307d08c11130c39a28abc7c290b23f6c347b09c2c649c054c338</container>\n+ </requirements>\n+ <command detect_errors="exit_code"><![CDATA[\n+ ## fasta setup ----------------------------\n+ #if $fasta_or_text.input_mode == \'history\':\n+ cp \'$fasta_or_text.fasta_file\' input.fasta &&\n+\n+ #elif $fasta_or_text.input_mode == \'textbox\':\n+ echo \'$fasta_or_text.fasta_text\' > input.fasta &&\n+ #end if\n+\n+ python3 \'$__tool_directory__/validate_fasta.py\' input.fasta &&\n+\n+ ## env vars -------------------------------\n+ export TF_FORCE_UNIFIED_MEMORY=1 &&\n+ export XLA_PYTHON_CLIENT_MEM_FRACTION=4.0 &&\n+ export DATE=`date +"%Y-%m-%d"` &&\n+\n+ ## run alphafold -------------------------\n+ ln -s /app/alphafold/alphafold alphafold &&\n+ \t python /app/alphafold/run_alphafold.py\n+ \t --fasta_paths alphafold.fasta\n+ --output_dir output\n+ --data_dir /data ## location of the alphafold databases on pulsar node --> could this maybe a env var? 
$ALPHAFOLD_DB --> \\${ALPHAFOLD_DB:-/data}\n+ --uniref90_database_path /data/uniref90/uniref90.fasta\n+ --mgnify_database_path /data/mgnify/mgy_clusters_2018_12.fa\n+ --pdb70_database_path /data/pdb70/pdb70\n+ --template_mmcif_dir /data/pdb_mmcif/mmcif_files\n+ --obsolete_pdbs_path /data/pdb_mmcif/obsolete.dat\n+ --max_template_date=\\$DATE\n+ --bfd_database_path /data/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt\n+ --uniclust30_database_path /data/uniclust30/uniclust30_2018_08/uniclust30_2018_08\n+ &&\n+\n+ ## for dry run testing\n+ ## cp -r \'$__tool_directory__/output\' . &&\n+\n+ ## generate extra outputs -----------------\n+ ## plddts\n+ python3 \'$__tool_directory__/gen_extra_outputs.py\' output/alphafold $output_plddts &&\n+\n+ ## html\n+ mkdir -p \'${ html.files_path }\' &&\n+ cp \'$__tool_directory__/alphafold.html\' ${html} &&\n+ cp output/alphafold/ranked_*.pdb \'${html.files_path}\' &&\n+\n+ ## For some reason the working directory ends up being one level too deep!\n+ mv working/* .\n+ ]]></command>\n+ <inputs>\n+ <conditional name="fasta_or_text">\n+ <param name="input_mode" type="select" label="Fasta Input" help="Single protein sequence to fold. Input can be fasta file from history, or text. Provide only 1 sequence per job.">\n+ <option value="history">Use fasta from history</option>\n+ <option value="textbox">Paste sequence into textbox</option>\n+ </param>\n+ <when value="history">\n+ <param name="fasta_file" type="data" format="fasta" label="Fasta file from history" help="Select single fasta protein sequence from your history. If you wish to fold multiple proteins, submit an individual job for each protein." />\n+ </when>\n+ <when value="textbox">\n+ <param name="fasta_text" type="text" area="true" value="" label="Paste sequence" help="Paste single protein sequence into the textbox. If you wish to fold multiple proteins, submit individual jobs for each protein." 
/>\n+ </when>\n+ </conditional>\n+ <param name="output_plddts" type="boolean" checked="false" label="Output per-residue confidence sc'..b'tomic-level accuracy, even when no similar structures are known.\n+ |\n+\n+ *Downstream analysis*\n+\n+ | Obtaining a protein fold is the first step in many analyses.\n+ | The 3D models created by AlphaFold can be used in downstream analysis, including the following:\n+ |\n+\n+ - Inspecting protein features\n+ 3D viewers (pymol, chimera, ngl, blender) can be used to inspect active sites, regulatory domains, binding sites.\n+ - Molecular docking\n+ 3D structures can be used to predict the binding affinity of different compounds.\n+ This is especially useful in screening drug candidates.\n+ - Protein-protein interactions\n+ Proteins associate in many biological processes, including intracellular signalling pathways and protein complex formation.\n+ To predict these interactions, other programs may ingest 3D models predicted by AlphaFold. Proprietary softwares include `GOLD <https://www.ccdc.cam.ac.uk/solutions/csd-discovery/components/gold/>`_ and `SeeSAR <https://www.biosolveit.de/SeeSAR>`_, but many `free and open-source options <https://en.wikipedia.org/wiki/List_of_protein-ligand_docking_software>`_ are available such as `AutoDock <https://autodock.scripps.edu/>`_ and `SwissDock <http://www.swissdock.ch/>`_.\n+\n+ *Expected run times*\n+\n+ .. 
image:: https://github.com/usegalaxy-au/galaxy-local-tools/blob/1a8d3e8daa7ccc5a345ca377697735ab95ed0666/tools/alphafold/static/img/alphafold_runtime_graph.png?raw=true\n+ :height: 520\n+ :alt: Run time graph\n+\n+ |\n+ | In general, we observe a quadratic relationship between sequence length and time to fold.\n+ | Once your job begins, a sequence of 50aa will take approximately 1hr to complete, while a sequence of 2000aa will take about 18hrs.\n+ |\n+\n+ **Input**\n+\n+ *Amino acid sequence*\n+\n+ | AlphaFold accepts a **single amino acid sequence** in FASTA format.\n+ | You can choose to input either a file from your Galaxy history or paste a sequence into a text box.\n+ | Please paste only a single sequence - we can only process a single sequence per job.\n+ | Multiple sequences will return an error.\n+ |\n+\n+ **Outputs**\n+\n+ *Visualization*\n+\n+ | An interactive 3D graphic of the best predicted molecular structures.\n+ | This output can be opened in Galaxy to give a visual impression of the results, with different structural representations to choose from.\n+ | Open the "Visualization" history output by clicking on the "view data" icon:\n+ |\n+\n+ .. image:: https://github.com/usegalaxy-au/galaxy-local-tools/blob/1a8d3e8daa7ccc5a345ca377697735ab95ed0666/tools/alphafold/static/img/alphafold-visualization.png?raw=true\n+ :height: 520\n+ :alt: Result visualization\n+\n+ |\n+\n+ *PDB files*\n+\n+ | Five PDB (Protein Data Bank) files will be created for the best ranking models predicted by AlphaFold.\n+ | These files describe the molecular structures and can be used for downstream analysis. e.g. 
*in silico* molecular docking.\n+ |\n+\n+ *Model confidence scores (optional)*\n+\n+ | This optional output produces a file which describes the confidence scores for each model (based on `pLDDTs <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3799472/>`_) which may be useful for downstream analysis.\n+ | Model confidence scores are also included as a column in the default PDB output.\n+ |\n+\n+ **External Resources**\n+\n+ We recommend checking out the\n+ `Alphafold Protein Structure Database <https://alphafold.ebi.ac.uk/>`_,\n+ which contains predicted sequences for thousands of Human proteins. See also:\n+\n+ - `Google Deepmind\'s article on AlphaFold <https://deepmind.com/blog/article/alphafold-a-solution-to-a-50-year-old-grand-challenge-in-biology>`_\n+ - `AlphaFold source code on GitHub <https://github.com/deepmind/alphafold>`_\n+\n+ ]]></help>\n+ <citations>\n+ <citation type="doi">https://doi.org/10.1038/s41586-021-03819-2</citation>\n+ </citations>\n+</tool>\n' |
b |
diff -r 000000000000 -r 7ae9d78b06f5 gen_extra_outputs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gen_extra_outputs.py Fri Jan 28 04:56:29 2022 +0000 |
[ |
@@ -0,0 +1,155 @@ + + +import json +import pickle +import argparse +from typing import Any, Dict, List + + +class Settings: + """parses then keeps track of program settings""" + def __init__(self): + self.workdir = None + self.output_confidence_scores = True + self.output_residue_scores = False + + def parse_settings(self) -> None: + parser = argparse.ArgumentParser() + parser.add_argument( + "workdir", + help="alphafold output directory", + type=str + ) + parser.add_argument( + "-p", + "--plddts", + help="output per-residue confidence scores (pLDDTs)", + action="store_true" + ) + args = parser.parse_args() + self.workdir = args.workdir.rstrip('/') + self.output_residue_scores = args.plddts + + +class ExecutionContext: + """uses program settings to get paths to files etc""" + def __init__(self, settings: Settings): + self.settings = settings + + @property + def ranking_debug(self) -> str: + return f'{self.settings.workdir}/ranking_debug.json' + + @property + def model_pkls(self) -> List[str]: + return [f'{self.settings.workdir}/result_model_{i}.pkl' + for i in range(1, 6)] + + @property + def model_conf_score_output(self) -> str: + return f'{self.settings.workdir}/model_confidence_scores.tsv' + + @property + def plddt_output(self) -> str: + return f'{self.settings.workdir}/plddts.tsv' + + +class FileLoader: + """loads file data for use by other classes""" + def __init__(self, context: ExecutionContext): + self.context = context + + def get_model_mapping(self) -> Dict[str, int]: + data = self.load_ranking_debug() + return {name: int(rank) + 1 + for (rank, name) in enumerate(data['order'])} + + def get_conf_scores(self) -> Dict[str, float]: + data = self.load_ranking_debug() + return {name: float(f'{score:.2f}') + for name, score in data['plddts'].items()} + + def load_ranking_debug(self) -> Dict[str, Any]: + with open(self.context.ranking_debug, 'r') as fp: + return json.load(fp) + + def get_model_plddts(self) -> Dict[str, List[float]]: + plddts: Dict[str, 
List[float]] = {} + model_pkls = self.context.model_pkls + for i in range(5): + pklfile = model_pkls[i] + with open(pklfile, 'rb') as fp: + data = pickle.load(fp) + plddts[f'model_{i+1}'] = [float(f'{x:.2f}') for x in data['plddt']] + return plddts + + +class OutputGenerator: + """generates the output data we are interested in creating""" + def __init__(self, loader: FileLoader): + self.loader = loader + + def gen_conf_scores(self): + mapping = self.loader.get_model_mapping() + scores = self.loader.get_conf_scores() + ranked = list(scores.items()) + ranked.sort(key=lambda x: x[1], reverse=True) + return {f'model_{mapping[name]}': score + for name, score in ranked} + + def gen_residue_scores(self) -> Dict[str, List[float]]: + mapping = self.loader.get_model_mapping() + model_plddts = self.loader.get_model_plddts() + return {f'model_{mapping[name]}': plddts + for name, plddts in model_plddts.items()} + + +class OutputWriter: + """writes generated data to files""" + def __init__(self, context: ExecutionContext): + self.context = context + + def write_conf_scores(self, data: Dict[str, float]) -> None: + outfile = self.context.model_conf_score_output + with open(outfile, 'w') as fp: + for model, score in data.items(): + fp.write(f'{model}\t{score}\n') + + def write_residue_scores(self, data: Dict[str, List[float]]) -> None: + outfile = self.context.plddt_output + model_plddts = list(data.items()) + model_plddts.sort() + + with open(outfile, 'w') as fp: + for model, plddts in model_plddts: + plddt_str_list = [str(x) for x in plddts] + plddt_str = ','.join(plddt_str_list) + fp.write(f'{model}\t{plddt_str}\n') + + +def main(): + # setup + settings = Settings() + settings.parse_settings() + context = ExecutionContext(settings) + loader = FileLoader(context) + + # generate & write outputs + generator = OutputGenerator(loader) + writer = OutputWriter(context) + + # confidence scores + conf_scores = generator.gen_conf_scores() + writer.write_conf_scores(conf_scores) + + # 
per-residue plddts + if settings.output_residue_scores: + residue_scores = generator.gen_residue_scores() + writer.write_residue_scores(residue_scores) + + +if __name__ == '__main__': + main() + + + |
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_all_data.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_all_data.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
@@ -0,0 +1,74 @@ +#!/bin/bash +# +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Downloads and unzips all required data for AlphaFold. +# +# Usage: bash download_all_data.sh /path/to/download/directory +set -e + +if [[ $# -eq 0 ]]; then + echo "Error: download directory must be provided as an input argument." + exit 1 +fi + +if ! command -v aria2c &> /dev/null ; then + echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." + exit 1 +fi + +DOWNLOAD_DIR="$1" +DOWNLOAD_MODE="${2:-full_dbs}" # Default mode to full_dbs. +if [[ "${DOWNLOAD_MODE}" != full_dbs && "${DOWNLOAD_MODE}" != reduced_dbs ]] +then + echo "DOWNLOAD_MODE ${DOWNLOAD_MODE} not recognized." + exit 1 +fi + +SCRIPT_DIR="$(dirname "$(realpath "$0")")" + +echo "Downloading AlphaFold parameters..." +bash "${SCRIPT_DIR}/download_alphafold_params.sh" "${DOWNLOAD_DIR}" + +if [[ "${DOWNLOAD_MODE}" = reduced_dbs ]] ; then + echo "Downloading Small BFD..." + bash "${SCRIPT_DIR}/download_small_bfd.sh" "${DOWNLOAD_DIR}" +else + echo "Downloading BFD..." + bash "${SCRIPT_DIR}/download_bfd.sh" "${DOWNLOAD_DIR}" +fi + +echo "Downloading MGnify..." +bash "${SCRIPT_DIR}/download_mgnify.sh" "${DOWNLOAD_DIR}" + +echo "Downloading PDB70..." +bash "${SCRIPT_DIR}/download_pdb70.sh" "${DOWNLOAD_DIR}" + +echo "Downloading PDB mmCIF files..." +bash "${SCRIPT_DIR}/download_pdb_mmcif.sh" "${DOWNLOAD_DIR}" + +echo "Downloading Uniclust30..." 
+bash "${SCRIPT_DIR}/download_uniclust30.sh" "${DOWNLOAD_DIR}" + +echo "Downloading Uniref90..." +bash "${SCRIPT_DIR}/download_uniref90.sh" "${DOWNLOAD_DIR}" + +echo "Downloading UniProt..." +bash "${SCRIPT_DIR}/download_uniprot.sh" "${DOWNLOAD_DIR}" + +echo "Downloading PDB SeqRes..." +bash "${SCRIPT_DIR}/download_pdb_seqres.sh" "${DOWNLOAD_DIR}" + +echo "All data downloaded." |
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_alphafold_params.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_alphafold_params.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
@@ -0,0 +1,41 @@ +#!/bin/bash +# +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Downloads and unzips the AlphaFold parameters. +# +# Usage: bash download_alphafold_params.sh /path/to/download/directory +set -e + +if [[ $# -eq 0 ]]; then + echo "Error: download directory must be provided as an input argument." + exit 1 +fi + +if ! command -v aria2c &> /dev/null ; then + echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." + exit 1 +fi + +DOWNLOAD_DIR="$1" +ROOT_DIR="${DOWNLOAD_DIR}/params" +SOURCE_URL="https://storage.googleapis.com/alphafold/alphafold_params_2022-01-19.tar" +BASENAME=$(basename "${SOURCE_URL}") + +mkdir --parents "${ROOT_DIR}" +aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" +tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \ + --directory="${ROOT_DIR}" --preserve-permissions +rm "${ROOT_DIR}/${BASENAME}" |
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_bfd.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_bfd.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
@@ -0,0 +1,43 @@ +#!/bin/bash +# +# Copyright 2021 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Downloads and unzips the BFD database for AlphaFold. +# +# Usage: bash download_bfd.sh /path/to/download/directory +set -e + +if [[ $# -eq 0 ]]; then + echo "Error: download directory must be provided as an input argument." + exit 1 +fi + +if ! command -v aria2c &> /dev/null ; then + echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)." + exit 1 +fi + +DOWNLOAD_DIR="$1" +ROOT_DIR="${DOWNLOAD_DIR}/bfd" +# Mirror of: +# https://bfd.mmseqs.com/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz. +SOURCE_URL="https://storage.googleapis.com/alphafold-databases/casp14_versions/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz" +BASENAME=$(basename "${SOURCE_URL}") + +mkdir --parents "${ROOT_DIR}" +aria2c "${SOURCE_URL}" --dir="${ROOT_DIR}" +tar --extract --verbose --file="${ROOT_DIR}/${BASENAME}" \ + --directory="${ROOT_DIR}" +rm "${ROOT_DIR}/${BASENAME}" |
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_mgnify.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_mgnify.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches and decompresses the MGnify database used by AlphaFold.
#
# Usage: bash download_mgnify.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
mgnify_dir="${download_dir}/mgnify"
# Mirror of:
# ftp://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2018_12/mgy_clusters.fa.gz
mgnify_url="https://storage.googleapis.com/alphafold-databases/casp14_versions/mgy_clusters_2018_12.fa.gz"
archive=$(basename "${mgnify_url}")

mkdir -p "${mgnify_dir}"
aria2c "${mgnify_url}" --dir="${mgnify_dir}"
pushd "${mgnify_dir}"
# gunzip replaces the .gz with the decompressed file, so no cleanup needed.
gunzip "${mgnify_dir}/${archive}"
popd
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_pdb70.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_pdb70.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches and unpacks the PDB70 database used by AlphaFold.
#
# Usage: bash download_pdb70.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
pdb70_dir="${download_dir}/pdb70"
pdb70_url="http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz"
archive=$(basename "${pdb70_url}")

mkdir -p "${pdb70_dir}"
aria2c "${pdb70_url}" --dir="${pdb70_dir}"
# Unpack, then drop the tarball to save space.
tar -xvf "${pdb70_dir}/${archive}" -C "${pdb70_dir}"
rm "${pdb70_dir}/${archive}"
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_pdb_mmcif.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_pdb_mmcif.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Mirrors the PDB mmCIF archive via rsync, decompresses every entry and
# flattens the divided directory layout into a single mmcif_files folder.
#
# Usage: bash download_pdb_mmcif.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The obsolete-entries list is fetched with aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

# The mmCIF mirror itself is fetched with rsync.
if ! command -v rsync > /dev/null 2>&1; then
  echo "Error: rsync could not be found. Please install rsync."
  exit 1
fi

download_dir="$1"
mmcif_root="${download_dir}/pdb_mmcif"
raw_dir="${mmcif_root}/raw"
flat_dir="${mmcif_root}/mmcif_files"

echo "Running rsync to fetch all mmCIF files (note that the rsync progress estimate might be inaccurate)..."
mkdir -p "${raw_dir}"
# -rlptz = --recursive --links --perms --times --compress.
rsync -rlptz --info=progress2 --delete --port=33444 \
  rsync.rcsb.org::ftp_data/structures/divided/mmCIF/ \
  "${raw_dir}"

echo "Unzipping all mmCIF files..."
find "${raw_dir}/" -type f -iname "*.gz" -exec gunzip {} +

echo "Flattening all mmCIF files..."
mkdir -p "${flat_dir}"
find "${raw_dir}" -type d -empty -delete  # Delete empty directories.
for subdir in "${raw_dir}"/*; do
  mv "${subdir}/"*.cif "${flat_dir}"
done

# Delete empty download directory structure.
find "${raw_dir}" -type d -empty -delete

aria2c "ftp://ftp.wwpdb.org/pub/pdb/data/status/obsolete.dat" --dir="${mmcif_root}"
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_pdb_seqres.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_pdb_seqres.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches the PDB SeqRes database used by AlphaFold (plain text, no
# decompression needed).
#
# Usage: bash download_pdb_seqres.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
seqres_dir="${download_dir}/pdb_seqres"
seqres_url="ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt"

mkdir -p "${seqres_dir}"
aria2c "${seqres_url}" --dir="${seqres_dir}"
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_small_bfd.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_small_bfd.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches and decompresses the Small BFD database used by AlphaFold's
# reduced_dbs preset.
#
# Usage: bash download_small_bfd.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
small_bfd_dir="${download_dir}/small_bfd"
small_bfd_url="https://storage.googleapis.com/alphafold-databases/reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz"
archive=$(basename "${small_bfd_url}")

mkdir -p "${small_bfd_dir}"
aria2c "${small_bfd_url}" --dir="${small_bfd_dir}"
pushd "${small_bfd_dir}"
# gunzip replaces the .gz with the decompressed file, so no cleanup needed.
gunzip "${small_bfd_dir}/${archive}"
popd
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_uniclust30.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_uniclust30.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches and unpacks the Uniclust30 database used by AlphaFold.
#
# Usage: bash download_uniclust30.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
uniclust_dir="${download_dir}/uniclust30"
# Mirror of:
# http://wwwuser.gwdg.de/~compbiol/uniclust/2018_08/uniclust30_2018_08_hhsuite.tar.gz
uniclust_url="https://storage.googleapis.com/alphafold-databases/casp14_versions/uniclust30_2018_08_hhsuite.tar.gz"
archive=$(basename "${uniclust_url}")

mkdir -p "${uniclust_dir}"
aria2c "${uniclust_url}" --dir="${uniclust_dir}"
# Unpack, then drop the tarball to save space.
tar -xvf "${uniclust_dir}/${archive}" -C "${uniclust_dir}"
rm "${uniclust_dir}/${archive}"
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_uniprot.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_uniprot.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches, decompresses and merges the SwissProt and TrEMBL databases
# into a single uniprot.fasta for AlphaFold-Multimer.
#
# Usage: bash download_uniprot.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
uniprot_dir="${download_dir}/uniprot"

trembl_url="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_trembl.fasta.gz"
trembl_gz=$(basename "${trembl_url}")
trembl_fasta="${trembl_gz%.gz}"

sprot_url="ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz"
sprot_gz=$(basename "${sprot_url}")
sprot_fasta="${sprot_gz%.gz}"

mkdir -p "${uniprot_dir}"
aria2c "${trembl_url}" --dir="${uniprot_dir}"
aria2c "${sprot_url}" --dir="${uniprot_dir}"
pushd "${uniprot_dir}"
gunzip "${uniprot_dir}/${trembl_gz}"
gunzip "${uniprot_dir}/${sprot_gz}"

# Concatenate TrEMBL and SwissProt, rename to uniprot and clean up.
cat "${uniprot_dir}/${sprot_fasta}" >> "${uniprot_dir}/${trembl_fasta}"
mv "${uniprot_dir}/${trembl_fasta}" "${uniprot_dir}/uniprot.fasta"
rm "${uniprot_dir}/${sprot_fasta}"
popd
b |
diff -r 000000000000 -r 7ae9d78b06f5 scripts/download_uniref90.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/download_uniref90.sh Fri Jan 28 04:56:29 2022 +0000 |
[ |
#!/bin/bash
#
# Copyright 2021 DeepMind Technologies Limited
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Fetches and decompresses the UniRef90 database used by AlphaFold.
#
# Usage: bash download_uniref90.sh /path/to/download/directory
set -e

# A download directory is mandatory.
if [ "$#" -eq 0 ]; then
  echo "Error: download directory must be provided as an input argument."
  exit 1
fi

# The actual transfer is delegated to aria2c.
if ! command -v aria2c > /dev/null 2>&1; then
  echo "Error: aria2c could not be found. Please install aria2c (sudo apt install aria2)."
  exit 1
fi

download_dir="$1"
uniref90_dir="${download_dir}/uniref90"
uniref90_url="ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/uniref90.fasta.gz"
archive=$(basename "${uniref90_url}")

mkdir -p "${uniref90_dir}"
aria2c "${uniref90_url}" --dir="${uniref90_dir}"
pushd "${uniref90_dir}"
# gunzip replaces the .gz with the decompressed file, so no cleanup needed.
gunzip "${uniref90_dir}/${archive}"
popd
b |
diff -r 000000000000 -r 7ae9d78b06f5 static/img/alphafold-visualization.png |
b |
Binary file static/img/alphafold-visualization.png has changed |
b |
diff -r 000000000000 -r 7ae9d78b06f5 static/img/alphafold_runtime_graph.png |
b |
Binary file static/img/alphafold_runtime_graph.png has changed |
b |
diff -r 000000000000 -r 7ae9d78b06f5 test-data/test1.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test1.fasta Fri Jan 28 04:56:29 2022 +0000 |
b |
@@ -0,0 +1,3 @@ +>UPI0015CE2E61 status=active +DGKILADKVSDKLEQTATLTGLDYGRFTRSMLLSQGQFAAFLNAKPSDRAELLEELTGTE +IYGQISAMVYEQHKAARHALEKFEAQAAGIVLLTEAQQ |
b |
diff -r 000000000000 -r 7ae9d78b06f5 validate_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/validate_fasta.py Fri Jan 28 04:56:29 2022 +0000 |
[ |
"""Validate a FASTA file for use as AlphaFold input.

Reads a (possibly headerless) FASTA file, checks that it contains exactly
one protein sequence of acceptable length and alphabet, then writes a
cleaned, line-wrapped copy to ``alphafold.fasta``.
"""

import argparse
from typing import List, TextIO


class Fasta:
    """A single FASTA record: a header line plus its amino-acid sequence."""

    def __init__(self, header_str: str, seq_str: str):
        self.header = header_str  # '>' header line (may be auto-generated)
        self.aa_seq = seq_str     # amino-acid sequence with newlines removed


class FastaLoader:
    def __init__(self):
        """creates a Fasta() from a file"""
        self.fastas: List[Fasta] = []

    def load(self, fasta_path: str) -> List[Fasta]:
        """Parse ``fasta_path`` and return its records as Fasta objects.

        The input may be normal FASTA (header + sequence) or a bare
        sequence with no header line. Blank lines anywhere in the file
        are skipped. (Previously the readline-based loop stopped at the
        first blank line, silently truncating any sequence data after it.)
        """
        header = ''
        sequence = ''
        with open(fasta_path, 'r') as fp:
            for raw_line in fp:
                line = raw_line.strip()  # also drops '\r' from CRLF input
                if not line:
                    continue  # ignore blank lines instead of stopping early
                if line.startswith('>'):
                    # Flush the previous record (no-op while buffers empty).
                    self.update_fastas(header, sequence)
                    header = line
                    sequence = ''
                else:
                    sequence += line
        # After reading the whole file, the buffers may hold a final record.
        self.update_fastas(header, sequence)
        return self.fastas

    def interpret_first_line(self, fp: TextIO):
        """Classify the first line of ``fp`` as a header or sequence data.

        Kept for backward compatibility; ``load`` no longer calls it.
        """
        header = ''
        sequence = ''
        line = fp.readline().rstrip('\n')
        if line.startswith('>'):
            header = line
        else:
            sequence += line
        return header, sequence

    def update_fastas(self, header: str, sequence: str) -> None:
        """Append a Fasta built from the buffers, if a sequence was read."""
        if sequence == '':
            return
        if header == '':
            # Bare sequences get a generic, numbered header.
            header = f'>sequence_{len(self.fastas)}'
        self.fastas.append(Fasta(header, sequence))


class FastaValidator:
    """Checks that exactly one valid protein sequence was supplied."""

    def __init__(self, fasta_list: List[Fasta]):
        self.fasta_list = fasta_list
        self.min_length = 30    # AlphaFold needs a minimum viable length
        self.max_length = 2000  # guard against excessive runtimes
        # Accepted IUPAC amino-acid codes (including ambiguity codes and '-').
        self.iupac_characters = {
            'A', 'B', 'C', 'D', 'E', 'F', 'G',
            'H', 'I', 'K', 'L', 'M', 'N', 'P',
            'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
            'Y', 'Z', '-'
        }

    def validate(self):
        """performs fasta validation"""
        self.validate_num_seqs()
        self.validate_length()
        self.validate_alphabet()
        # not checking for 'X' nucleotides at the moment.
        # alphafold can throw an error if it doesn't like it.
        # self.validate_x()

    def validate_num_seqs(self) -> None:
        """Raise unless exactly one sequence is present."""
        if len(self.fasta_list) > 1:
            raise Exception(f'Error encountered validating fasta: More than 1 sequence detected ({len(self.fasta_list)}). Please use single fasta sequence as input')
        elif len(self.fasta_list) == 0:
            # Plain string: no placeholders, so no f-prefix needed.
            raise Exception('Error encountered validating fasta: input file has no fasta sequences')

    def validate_length(self):
        """Confirms whether sequence length is valid. """
        fasta = self.fasta_list[0]
        # Messages now match the actual inclusive bounds of the checks.
        if len(fasta.aa_seq) < self.min_length:
            raise Exception(f'Error encountered validating fasta: Sequence too short ({len(fasta.aa_seq)}aa). Must be at least {self.min_length}aa')
        if len(fasta.aa_seq) > self.max_length:
            raise Exception(f'Error encountered validating fasta: Sequence too long ({len(fasta.aa_seq)}aa). Must be at most {self.max_length}aa')

    def validate_alphabet(self):
        """
        Confirms whether the sequence conforms to IUPAC codes.
        If not, reports the offending character and its position.
        """
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char not in self.iupac_characters:
                raise Exception(f'Error encountered validating fasta: Invalid amino acid found at pos {i}: {char}')

    def validate_x(self):
        """checks if any bases are X. TODO check whether alphafold accepts X bases."""
        fasta = self.fasta_list[0]
        for i, char in enumerate(fasta.aa_seq.upper()):
            if char == 'X':
                raise Exception(f'Error encountered validating fasta: Unsupported aa code "X" found at pos {i}')


class FastaWriter:
    """Writes a single Fasta record to disk with fixed-width lines."""

    def __init__(self) -> None:
        self.outfile = 'alphafold.fasta'   # fixed output name for the wrapper
        self.formatted_line_len = 60       # standard FASTA wrap width

    def write(self, fasta: Fasta) -> None:
        """Write ``fasta`` to ``self.outfile``, wrapping the sequence."""
        with open(self.outfile, 'w') as fp:
            fp.write(fasta.header + '\n')
            # format_sequence already ends with a newline; writing another
            # (as before) left a spurious blank line at end of file.
            fp.write(self.format_sequence(fasta.aa_seq))

    def format_sequence(self, aa_seq: str) -> str:
        """Return ``aa_seq`` wrapped to ``formatted_line_len`` chars per line,
        each line (including the last) terminated with a newline."""
        width = self.formatted_line_len
        chunks = [aa_seq[i: i + width] for i in range(0, len(aa_seq), width)]
        return ''.join(chunk + '\n' for chunk in chunks)


def main():
    """Load, validate and re-write the input FASTA."""
    # load fasta file
    args = parse_args()
    fastas = FastaLoader().load(args.input_fasta)

    # validate (raises on any problem)
    FastaValidator(fastas).validate()

    # write cleaned version
    FastaWriter().write(fastas[0])


def parse_args() -> argparse.Namespace:
    """Parse the single positional argument: the input FASTA path."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "input_fasta",
        help="input fasta file",
        type=str
    )
    return parser.parse_args()


if __name__ == '__main__':
    main()