Mercurial > repos > iuc > data_manager_clair3_models
changeset 0:11e42265a9b0 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_clair3_models commit 2672414472cc968c736dc7d42f5a119ff8c16c62
author | iuc |
---|---|
date | Thu, 20 Feb 2025 17:57:11 +0000 |
parents | |
children | |
files | data_manager/install_clair3_models.xml data_manager/model_fetcher.py data_manager_conf.xml test-data/clair3_models.loc tool-data/clair3_models.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 7 files changed, 327 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/install_clair3_models.xml Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,102 @@ +<tool id="data_manager_clair3_models" name="Clair3 model downloader" version="0.0.1" tool_type="manage_data" profile="23.2"> + <requirements> + <requirement type="package" version="3.12">python</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + ## this code looks up the existing table and uses it to build a list of known models + ## because models are uniquely identified by their name, downloading duplicate models + ## would be an error, so it is useful to know which models we already have + ## + ## $__app__.tool_data_tables is a dictionary where the keys are data table names and the values + ## are TabularToolDataTable objects (from lib/galaxy/tools/data/__init__.py) + ## + ## the get_fields() method on the TabularToolDataTable returns a list of lists, with one list + ## per line of the tool data table, so row[0] is the first field (i.e. the value column) + ## + + #set $data_table = $__app__.tool_data_tables.get("clair3_models") + #if $data_table is not None and len($data_table.get_fields()) > 0: + #set $known_models = ','.join([ row[0] for row in $data_table.get_fields() ]) + #set $sha256_sums = ','.join([ row[1] for row in $data_table.get_fields() ]) + #else + #set $known_models = None + #set $sha256_sums = None + #end if + + python '$__tool_directory__/model_fetcher.py' + '${output_file}' + #if $known_models is not None + --known_models '$known_models' + --sha256_sums '$sha256_sums' + #end if + #if $model_selection.source == 'latest' + --download_latest + #elif $model_selection.source == 'chosen' + --download_models '$model_selection.model_list' + #end if + ]]></command> + <inputs> + <conditional name="model_selection"> + <param name="source" label="Select the source of the list of models to download" type="select"> + <option value="latest">Latest models from Rerio page</option> + <option value="chosen">User provided list of models</option> + </param> + <when value="latest"> + </when> + <when value="chosen"> + <param name="model_list" type="text" label="List of models to download" help="A comma separated list of model to download, e.g. 'r1041_e82_400bps_sup_v430,r1041_e82_400bps_hac_v430'"> + <validator type="regex" message="Invalid model list. Format is a comma separated list of model names (e.g. 'r1041_e82_400bps_sup_v430,r1041_e82_400bps_hac_v430')">^[a-z_0-9,]+$</validator> + </param> + </when> + </conditional> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json" label="Data Manager Output (JSON)" /> + </outputs> + <tests> + <test> <!-- test1 --> + <conditional name="model_selection"> + <param name="source" value="chosen"/> + <param name="model_list" value="r1041_e82_400bps_sup_v500,r1041_e82_400bps_hac_v500" /> + </conditional> + <output name="output_file"> + <assert_contents> + <!-- the text 'r1041_e82_400bps_sup_v500' is only there if the test is run for the first time (i.e. empty test-data/clair3_models.loc) so need to look for something else --> + <has_text text='clair3_models' /> + </assert_contents> + </output> + </test> + <test> <!-- test2 --> + <conditional name="model_selection"> + <param name="source" value="latest"/> + </conditional> + <output name="output_file"> + <assert_contents> + <!-- because we don't know what the names of the latest models are we can only test to see if the data table output is created --> + <has_text text='data_tables' /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ + Clair3_ is a variant caller for long read data developed at the University of Hong Kong. This tool makes use of models trained to match particular + sequencing technologies and basecallers. Oxford Nanopore provides a set of models for Clair3 on their Rerio_ page. These tools are designed for + "research release" under the terms of the "Oxford Nanopore Technologies, Ltd. Public License Version 1.0" license_. This data manager allows + downloading model files from the Rerio page and installing them on a Galaxy server. + + .. _Clair3: https://github.com/HKU-BAL/Clair3 + .. _Rerio: https://github.com/nanoporetech/rerio + .. _license: https://github.com/nanoporetech/rerio/blob/master/LICENCE.txt + ]]> + </help> + <citations> + <citation type="doi">10.1101/2021.12.29.474431v2</citation> + <citation type="bibtex"><![CDATA[@misc{ONT2024, + title = {Rerio}, + author = {Oxford Nanopore Technologies}, + year = 2024, + howpublished = {\url{https://github.com/nanoporetech/rerio}}, + commit = {c0c8ce6} + }]]></citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/model_fetcher.py Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 + +import argparse +import json +import sys +import tarfile +from hashlib import sha256 +from io import BytesIO, StringIO +from pathlib import Path +from urllib.error import HTTPError +from urllib.request import Request, urlopen + +DATA_TABLE_NAME = 'clair3_models' + + +def find_latest_models(): + # based on the README.rst of the rerio repository as of 7 January 2025 + url = 'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/README.rst' + httprequest = Request(url) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the latest models: {response.status}') + data = response.read().decode('utf-8') + init_line_seen = False + latest_seen = False + config_line_seen = False + read_lines = False + models = [] + # the file that we are parsing has a section that looks like this: + # Clair3 Models + # ------------- + + # Clair3 models for the following configurations are available: + + # Latest: + + # ========================== =================== ======================= + # Config Chemistry Dorado basecaller model + # ========================== =================== ======================= + # r1041_e82_400bps_sup_v500 R10.4.1 E8.2 (5kHz) v5.0.0 SUP + # r1041_e82_400bps_hac_v500 R10.4.1 E8.2 (5kHz) v5.0.0 HAC + # r1041_e82_400bps_sup_v410 R10.4.1 E8.2 (4kHz) v4.1.0 SUP + # r1041_e82_400bps_hac_v410 R10.4.1 E8.2 (4kHz) v4.1.0 HAC + # ========================== =================== ======================= + # + # and the aim is to extract the list of model names from the table by successfully looking for + # "Clair3 Models", then "Latest:", then "Config" and then "=====" and then reading the lines until + # the next "=====" is encountered + for line in StringIO(data): + if read_lines: + if line.startswith('====='): + read_lines = False + break + model = line.split()[0] + models.append(model) + if config_line_seen and line.startswith('====='): + read_lines = True + continue + if init_line_seen and line.startswith('Latest:'): + latest_seen = True + continue + if latest_seen and line.startswith('Config'): + config_line_seen = True + continue + if line.startswith('Clair3 Models'): + init_line_seen = True + continue + return models + + +def fetch_model(model_name): + # the model files are tar gzipped, with a structure like: + # model_name/pileup.index + # model_name/full_alignment.index + # and other files, with the key point being that the model_name becoomes the model_directory + + url = f'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/clair3_models/{model_name}_model' + httprequest = Request(url) + try: + # urlopen throws a HTTPError if it gets a 404 status (and perhaps other non-200 status?) + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the model {model_name}: {response.status}') + final_url = response.read().decode('utf-8').strip() + httprequest = Request(final_url) + except HTTPError as e: + raise IOError(f'Failed to fetch the model {model_name}: {e}') + + with urlopen(httprequest) as response: + if response.status != 200: + raise IOError(f'Failed to fetch the model {model_name} from CDN URL {final_url}: {response.status}') + data = response.read() + return data + + +def unpack_model(data, outdir): + with tarfile.open(fileobj=BytesIO(data), mode='r:*') as tar: + tar.extractall(outdir) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('dm_filename', type=str, help='The filename of the data manager file to read parameters from and write outputs to') + parser.add_argument('--known_models', type=str, help='List of models already known in the Galaxy data table') + parser.add_argument('--sha256_sums', type=str, help='List of sha256sums of the models already known in the Galaxy data table') + parser.add_argument('--download_latest', action='store_true', default=False, help='Download the latest models as per the Rerio repository') + parser.add_argument('--download_models', type=str, help='Comma separated list of models to download') + args = parser.parse_args() + + # parameters to a data manager are passed in a JSON file (see https://docs.galaxyproject.org/en/latest/dev/data_managers.html) and + # similarily a JSON file is created to pass the output back to Galaxy + models = [] + if args.download_latest: + models.extend(find_latest_models()) + if args.download_models: + models.extend(args.download_models.split(',')) + + if not models: + sys.exit('No models to download, please specify either --download_latest or --download_models') + + with open(args.dm_filename) as fh: + config = json.load(fh) + if 'extra_files_path' not in config.get('output_data', [{}])[0]: + sys.exit('Please specify the output directory in the data manager configuration (the extra_files_path)') + output_directory = config["output_data"][0]["extra_files_path"] + if not Path(output_directory).exists(): + Path(output_directory).mkdir(parents=True) + + data_manager_dict = {} + data_manager_dict["data_tables"] = config.get("data_tables", {}) + data_manager_dict["data_tables"][DATA_TABLE_NAME] = [] + + known_models = set(args.known_models.split(',')) if args.known_models else set() + model_to_sha256 = {} + if args.known_models: + sha256_sums = args.sha256_sums.split(',') + for (i, model) in enumerate(known_models): + model_to_sha256[model] = sha256_sums[i] + + for model in models: + model_dir = Path(output_directory) / model + # The data table cannot handle duplicate entries, so we skip models that are already in the data table + if model in known_models: + print(f'Model {model} already exists, skipping', file=sys.stderr) + continue + data = fetch_model(model) + sha256sum = sha256(data).hexdigest() + + # Since we skip models that are already known we cannot test the sha256sum here. This code is retained to illustrate that an + # alternative logic would be to download the model each time and check if the sha256sum matches what is already known. Hopefully + # ONT does not update the models while keeping the same name, so this is not needed. The sha256sum is stored in the data table + # in case it is needed in the future. + # if model in model_to_sha256 and sha256sum != model_to_sha256[model]: + # sys.exit(f'Model {model} already exists with a different sha256sum {model_to_sha256[model]}. This is a serious error, inform the Galaxy admin') + + unpack_model(data, output_directory) + + data_manager_dict["data_tables"][DATA_TABLE_NAME].append( + dict( + value=model, + platform="ont", + sha256=sha256sum, + path=str(model_dir), + source="rerio" + ) + ) + + with open(args.dm_filename, 'w') as fh: + json.dump(data_manager_dict, fh, sort_keys=True, indent=4)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,22 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/install_clair3_models.xml" id="data_manager_clair3_models"> + <data_table name="clair3_models"> + <output> + <column name="value" /> + <column name="platform" /> + <column name="sha256" /> + <column name="path" output_ref="output_file" > + <!-- note: the Python script sanitises the possibly user-supplied scheme name ('value') --> + <move type="directory"> + <source>${path}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">clair3_models/#echo str($value)#</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/clair3_models/#echo str($value)#</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + <column name="source" /> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/clair3_models.loc Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,11 @@ +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Rerio site and provided as directories) +# +# the columns are: +# 1. value +# 2. platform +# 3. sha256sum (sha256 hash of the downloaded model, before unpacking) +# 4. path (path to directory containing model) +# 5. source (where the model came from. rerio means Oxford Nanopore's https://github.com/nanoporetech/rerio with its associated license) +# for example +# +# r1041_e82_400bps_hac_v500 ont a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 rerio
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/clair3_models.loc.sample Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,11 @@ +# this is a table separated file describing the locations of Clair3 models (which are download from Oxford Nanopore's Rerio site and provided as directories) +# +# the columns are: +# 1. value +# 2. platform +# 3. sha256sum (sha256 hash of the downloaded model, before unpacking) +# 4. path (path to directory containing model) +# 5. source (where the model came from. rerio means Oxford Nanopore's https://github.com/nanoporetech/rerio with its associated license) +# for example +# +# r1041_e82_400bps_hac_v500 ont a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5 /data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500 rerio
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,6 @@ +<tables> + <table name="clair3_models" comment_char="#" allow_duplicate_entries="False"> + <columns>value, platform, sha256, path, source</columns> + <file path="tool-data/clair3_models.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Feb 20 17:57:11 2025 +0000 @@ -0,0 +1,6 @@ +<tables> + <table name="clair3_models" comment_char="#" allow_duplicate_entries="False"> + <columns>value, platform, sha256, path, source</columns> + <file path="${__HERE__}/test-data/clair3_models.loc" /> + </table> +</tables> \ No newline at end of file