Mercurial > repos > iuc > data_manager_clair3_models

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/install_clair3_models.xml	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,102 @@
+<tool id="data_manager_clair3_models" name="Clair3 model downloader" version="0.0.1" tool_type="manage_data" profile="23.2">
+    <requirements>
+        <requirement type="package" version="3.12">python</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+            ## this code looks up the existing table and uses it to build a list of known models
+            ## because models are uniquely identified by their name, downloading duplicate models
+            ## would be an error, so it is useful to know which models we already have
+            ##
+            ## $__app__.tool_data_tables is a dictionary where the keys are data table names and the values
+            ## are TabularToolDataTable objects (from lib/galaxy/tools/data/__init__.py)
+            ##
+            ## the get_fields() method on the TabularToolDataTable returns a list of lists, with one list
+            ## per line of the tool data table. The fields follow the column order declared for the table
+            ## (value, platform, sha256, path, source), so row[0] is the value column (the model name)
+            ## and row[2] is the sha256 column (row[1] would be the platform)
+            ##
+
+            #set $data_table = $__app__.tool_data_tables.get("clair3_models")
+            #if $data_table is not None and len($data_table.get_fields()) > 0:
+                #set $known_models = ','.join([ row[0] for row in $data_table.get_fields() ])
+                #set $sha256_sums = ','.join([ row[2] for row in $data_table.get_fields() ])
+            #else
+                #set $known_models = None
+                #set $sha256_sums = None
+            #end if
+
+        python '$__tool_directory__/model_fetcher.py'
+            '${output_file}'
+            #if $known_models is not None
+                --known_models '$known_models'
+                --sha256_sums '$sha256_sums'
+            #end if
+            #if $model_selection.source == 'latest'
+                --download_latest
+            #elif $model_selection.source == 'chosen'
+                --download_models '$model_selection.model_list'
+            #end if
+    ]]></command>
+    <inputs>
+        <conditional name="model_selection">
+            <param name="source" label="Select the source of the list of models to download" type="select">
+                <option value="latest">Latest models from Rerio page</option>
+                <option value="chosen">User provided list of models</option>
+            </param>
+            <when value="latest">
+            </when>
+            <when value="chosen">
+                <param name="model_list" type="text" label="List of models to download" help="A comma separated list of models to download, e.g. 'r1041_e82_400bps_sup_v430,r1041_e82_400bps_hac_v430'">
+                    <!-- one or more model names separated by single commas; leading, trailing and doubled commas are rejected because they would produce empty model names -->
+                    <validator type="regex" message="Invalid model list. Format is a comma separated list of model names (e.g. 'r1041_e82_400bps_sup_v430,r1041_e82_400bps_hac_v430')">^[a-z_0-9]+(,[a-z_0-9]+)*$</validator>
+                </param>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- model_fetcher.py receives this file as its first argument: it reads the data manager parameters from it and then overwrites it with the JSON describing the new data table entries -->
+        <data name="output_file" format="data_manager_json" label="Data Manager Output (JSON)" />
+    </outputs>
+    <tests>
+        <test> <!-- test1: download an explicit, user-provided list of models -->
+            <conditional name="model_selection">
+                <param name="source" value="chosen"/>
+                <param name="model_list" value="r1041_e82_400bps_sup_v500,r1041_e82_400bps_hac_v500" />
+            </conditional>
+            <output name="output_file">
+                <assert_contents>
+                    <!-- the text 'r1041_e82_400bps_sup_v500' is only present when the test runs for the first time (i.e. with an empty test-data/clair3_models.loc), because already known models are skipped, so check for the data table name instead -->
+                    <has_text text='clair3_models' />
+                </assert_contents>
+            </output>
+        </test>
+        <test> <!-- test2: download whatever the Rerio README currently lists as the latest models -->
+            <conditional name="model_selection">
+                <param name="source" value="latest"/>
+            </conditional>
+            <output name="output_file">
+                <assert_contents>
+                    <!-- the names of the latest models are not known in advance, so we can only check that the data table output is created -->
+                    <has_text text='data_tables' />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+    Clair3_ is a variant caller for long read data developed at the University of Hong Kong. This tool makes use of models trained to match particular
+    sequencing technologies and basecallers. Oxford Nanopore provides a set of models for Clair3 on their Rerio_ page. These tools are designed for
+    "research release" under the terms of the "Oxford Nanopore Technologies, Ltd. Public License Version 1.0" license_. This data manager allows
+    downloading model files from the Rerio page and installing them on a Galaxy server.
+
+    .. _Clair3: https://github.com/HKU-BAL/Clair3
+    .. _Rerio: https://github.com/nanoporetech/rerio
+    .. _license: https://github.com/nanoporetech/rerio/blob/master/LICENCE.txt
+    ]]>
+    </help>
+    <citations>
+        <!-- Clair3 preprint; the DOI is given without the bioRxiv version suffix (e.g. "v2"), which belongs to the article URL and is not part of the DOI -->
+        <citation type="doi">10.1101/2021.12.29.474431</citation>
+        <citation type="bibtex"><![CDATA[@misc{ONT2024,
+            title        = {Rerio},
+            author       = {Oxford Nanopore Technologies},
+            year         = 2024,
+            howpublished = {\url{https://github.com/nanoporetech/rerio}},
+            commit       = {c0c8ce6}
+    }]]></citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/model_fetcher.py	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,169 @@
+#!/usr/bin/env python3
+
+import argparse
+import json
+import sys
+import tarfile
+from hashlib import sha256
+from io import BytesIO, StringIO
+from pathlib import Path
+from urllib.error import HTTPError
+from urllib.request import Request, urlopen
+
+DATA_TABLE_NAME = 'clair3_models'
+
+
+def find_latest_models():
+    # based on the README.rst of the rerio repository as of 7 January 2025
+    url = 'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/README.rst'
+    httprequest = Request(url)
+    with urlopen(httprequest) as response:
+        if response.status != 200:
+            raise IOError(f'Failed to fetch the latest models: {response.status}')
+        data = response.read().decode('utf-8')
+        init_line_seen = False
+        latest_seen = False
+        config_line_seen = False
+        read_lines = False
+        models = []
+        # the file that we are parsing has a section that looks like this:
+        # Clair3 Models
+        # -------------
+
+        # Clair3 models for the following configurations are available:
+
+        # Latest:
+
+        # ========================== =================== =======================
+        # Config                     Chemistry           Dorado basecaller model
+        # ========================== =================== =======================
+        # r1041_e82_400bps_sup_v500  R10.4.1 E8.2 (5kHz) v5.0.0 SUP
+        # r1041_e82_400bps_hac_v500  R10.4.1 E8.2 (5kHz) v5.0.0 HAC
+        # r1041_e82_400bps_sup_v410  R10.4.1 E8.2 (4kHz) v4.1.0 SUP
+        # r1041_e82_400bps_hac_v410  R10.4.1 E8.2 (4kHz) v4.1.0 HAC
+        # ========================== =================== =======================
+        #
+        # and the aim is to extract the list of model names from the table by successfully looking for
+        # "Clair3 Models", then "Latest:", then "Config" and then "=====" and then reading the lines until
+        # the next "=====" is encountered
+        for line in StringIO(data):
+            if read_lines:
+                if line.startswith('====='):
+                    read_lines = False
+                    break
+                model = line.split()[0]
+                models.append(model)
+            if config_line_seen and line.startswith('====='):
+                read_lines = True
+                continue
+            if init_line_seen and line.startswith('Latest:'):
+                latest_seen = True
+                continue
+            if latest_seen and line.startswith('Config'):
+                config_line_seen = True
+                continue
+            if line.startswith('Clair3 Models'):
+                init_line_seen = True
+                continue
+        return models
+
+
+def fetch_model(model_name):
+    # the model files are tar gzipped, with a structure like:
+    # model_name/pileup.index
+    # model_name/full_alignment.index
+    # and other files, with the key point being that the model_name becoomes the model_directory
+
+    url = f'https://raw.githubusercontent.com/nanoporetech/rerio/refs/heads/master/clair3_models/{model_name}_model'
+    httprequest = Request(url)
+    try:
+        # urlopen throws a HTTPError if it gets a 404 status (and perhaps other non-200 status?)
+        with urlopen(httprequest) as response:
+            if response.status != 200:
+                raise IOError(f'Failed to fetch the model {model_name}: {response.status}')
+            final_url = response.read().decode('utf-8').strip()
+        httprequest = Request(final_url)
+    except HTTPError as e:
+        raise IOError(f'Failed to fetch the model {model_name}: {e}')
+
+    with urlopen(httprequest) as response:
+        if response.status != 200:
+            raise IOError(f'Failed to fetch the model {model_name} from CDN URL {final_url}: {response.status}')
+        data = response.read()
+    return data
+
+
+def unpack_model(data, outdir):
+    with tarfile.open(fileobj=BytesIO(data), mode='r:*') as tar:
+        tar.extractall(outdir)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('dm_filename', type=str, help='The filename of the data manager file to read parameters from and write outputs to')
+    parser.add_argument('--known_models', type=str, help='List of models already known in the Galaxy data table')
+    parser.add_argument('--sha256_sums', type=str, help='List of sha256sums of the models already known in the Galaxy data table')
+    parser.add_argument('--download_latest', action='store_true', default=False, help='Download the latest models as per the Rerio repository')
+    parser.add_argument('--download_models', type=str, help='Comma separated list of models to download')
+    args = parser.parse_args()
+
+    # parameters to a data manager are passed in a JSON file (see https://docs.galaxyproject.org/en/latest/dev/data_managers.html) and
+    # similarily a JSON file is created to pass the output back to Galaxy
+    models = []
+    if args.download_latest:
+        models.extend(find_latest_models())
+    if args.download_models:
+        models.extend(args.download_models.split(','))
+
+    if not models:
+        sys.exit('No models to download, please specify either --download_latest or --download_models')
+
+    with open(args.dm_filename) as fh:
+        config = json.load(fh)
+    if 'extra_files_path' not in config.get('output_data', [{}])[0]:
+        sys.exit('Please specify the output directory in the data manager configuration (the extra_files_path)')
+    output_directory = config["output_data"][0]["extra_files_path"]
+    if not Path(output_directory).exists():
+        Path(output_directory).mkdir(parents=True)
+
+    data_manager_dict = {}
+    data_manager_dict["data_tables"] = config.get("data_tables", {})
+    data_manager_dict["data_tables"][DATA_TABLE_NAME] = []
+
+    known_models = set(args.known_models.split(',')) if args.known_models else set()
+    model_to_sha256 = {}
+    if args.known_models:
+        sha256_sums = args.sha256_sums.split(',')
+        for (i, model) in enumerate(known_models):
+            model_to_sha256[model] = sha256_sums[i]
+
+    for model in models:
+        model_dir = Path(output_directory) / model
+        # The data table cannot handle duplicate entries, so we skip models that are already in the data table
+        if model in known_models:
+            print(f'Model {model} already exists, skipping', file=sys.stderr)
+            continue
+        data = fetch_model(model)
+        sha256sum = sha256(data).hexdigest()
+
+        # Since we skip models that are already known we cannot test the sha256sum here. This code is retained to illustrate that an
+        # alternative logic would be to download the model each time and check if the sha256sum matches what is already known. Hopefully
+        # ONT does not update the models while keeping the same name, so this is not needed. The sha256sum is stored in the data table
+        # in case it is needed in the future.
+        # if model in model_to_sha256 and sha256sum != model_to_sha256[model]:
+        #    sys.exit(f'Model {model} already exists with a different sha256sum {model_to_sha256[model]}. This is a serious error, inform the Galaxy admin')
+
+        unpack_model(data, output_directory)
+
+        data_manager_dict["data_tables"][DATA_TABLE_NAME].append(
+            dict(
+                value=model,
+                platform="ont",
+                sha256=sha256sum,
+                path=str(model_dir),
+                source="rerio"
+            )
+        )
+
+    with open(args.dm_filename, 'w') as fh:
+        json.dump(data_manager_dict, fh, sort_keys=True, indent=4)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/install_clair3_models.xml" id="data_manager_clair3_models">
+        <data_table name="clair3_models">
+            <output>
+                <column name="value" />
+                <column name="platform" />
+                <column name="sha256" />
+                <column name="path" output_ref="output_file" >
+                    <!-- note: 'value' is the model name; user-supplied names are restricted to [a-z_0-9] by the regex validator in install_clair3_models.xml (model_fetcher.py does not sanitise them itself) -->
+                    <move type="directory">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">clair3_models/#echo str($value)#</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/clair3_models/#echo str($value)#</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="source" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/clair3_models.loc	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,11 @@
+# this is a tab separated file describing the locations of Clair3 models (which are downloaded from Oxford Nanopore's Rerio site and provided as directories)
+#
+# the columns are:
+# 1. value
+# 2. platform
+# 3. sha256sum (sha256 hash of the downloaded model, before unpacking)
+# 4. path (path to directory containing model)
+# 5. source (where the model came from. rerio means Oxford Nanopore's https://github.com/nanoporetech/rerio with its associated license)
+# for example
+#
+# r1041_e82_400bps_hac_v500	ont	a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5	/data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500	rerio
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/clair3_models.loc.sample	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,11 @@
+# this is a tab separated file describing the locations of Clair3 models (which are downloaded from Oxford Nanopore's Rerio site and provided as directories)
+#
+# the columns are:
+# 1. value
+# 2. platform
+# 3. sha256sum (sha256 hash of the downloaded model, before unpacking)
+# 4. path (path to directory containing model)
+# 5. source (where the model came from. rerio means Oxford Nanopore's https://github.com/nanoporetech/rerio with its associated license)
+# for example
+#
+# r1041_e82_400bps_hac_v500	ont	a1b998a80bc94ba4f5babc811d62e83a61bba3819188c488daee1c698bb72ae5	/data/galaxy/tool_data/clair3_models/r1041_e82_400bps_hac_v500	rerio
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,6 @@
+<tables>
+    <!-- Locations of installed Clair3 models; lines starting with '#' in the .loc file are comments and duplicate entries are not allowed -->
+    <table name="clair3_models" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, platform, sha256, path, source</columns>
+        <file path="tool-data/clair3_models.loc" />
+    </table>
+</tables>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Thu Feb 20 17:57:11 2025 +0000
@@ -0,0 +1,6 @@
+<tables>
+    <!-- Test version of the Clair3 model table: same columns as the sample configuration, but reading the .loc file from the test-data directory -->
+    <table name="clair3_models" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, platform, sha256, path, source</columns>
+        <file path="${__HERE__}/test-data/clair3_models.loc" />
+    </table>
+</tables>
\ No newline at end of file