Mercurial > repos > iuc > data_manager_ncbi_fcs_gx_database_downloader
changeset 1:35cef758050c draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 25c9d8d297d0e10f92e373f6a959274dedc10433
author | iuc |
---|---|
date | Wed, 09 Oct 2024 08:53:07 +0000 |
parents | 6be6e6198ac3 |
children | |
files | data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases.loc test-data/ncbi_fcs_gx_databases_ext.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases.loc.sample tool-data/ncbi_fcs_gx_databases_ext.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 12 files changed, 181 insertions(+), 50 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py Wed Oct 09 08:53:07 2024 +0000 @@ -4,14 +4,15 @@ import json import os import subprocess +import typing -def main(): +def main() -> None: opts = parse_args() output_dict = { "data_tables": { - "ncbi_fcs_gx_databases": sync_files(opts), + "ncbi_fcs_gx_databases_ext": sync_files(opts), "ncbi_fcs_gx_divisions": get_divisions(opts), } } @@ -20,17 +21,23 @@ print(json.dumps(output_dict, sort_keys=True, indent=2), file=f) -def parse_args(): +def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("--tag", required=True) - parser.add_argument("--source_manifest", required=True) + + parser.add_argument("--tag", required=True, help="Unique identifier for this database") + parser.add_argument("--description", required=True, help="Description for this database") + parser.add_argument("--source_manifest", required=True, help="Should the tool use the source manifest") + parser.add_argument("--use_source_manifest", action="store_true", help="Manifest file for this database") + parser.add_argument("--phone_home", action="store_true", help="Should phone home be enabled") + parser.add_argument("--phone_home_label", default="", help="Phone home label") + parser.add_argument("--node_cache_dir", required=True, help="Directory to copy database to local node") parser.add_argument("--output_file", required=True) parser.add_argument("--output_dir", required=True) return parser.parse_args() -def sync_files(opts): +def sync_files(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]: os.makedirs(opts.output_dir, exist_ok=True) args = [ @@ -51,8 +58,12 @@ "add": [ { "value": opts.tag, + "description": opts.description, "source_manifest": opts.source_manifest, - "name": opts.output_dir, + "use_source_manifest": "1" if opts.use_source_manifest else "0", + 
"phone_home": "1" if opts.phone_home else "0", + "phone_home_label": opts.phone_home_label, + "local_manifest": opts.output_dir, } ] } @@ -60,7 +71,7 @@ return entries_dict -def get_divisions(opts): +def get_divisions(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]: # descriptions for the top-level gx divisions top_level_description = { "anml": "Animals (Metazoa)", @@ -99,10 +110,10 @@ # add an element to support unknown/unclassified samples elements.append(("Unknown / Unclassified", "unkn:unknown")) - entries_dict = {"add": []} + entries_dict: typing.Dict[str, typing.List[typing.Dict[str, str]]] = {"add": []} for name, gx_div in sorted(elements): - entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "name": name}) + entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "description": name}) return entries_dict
--- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Wed Oct 09 08:53:07 2024 +0000 @@ -7,29 +7,129 @@ <expand macro="requirements"/> <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py' ---tag '$tag' ---source_manifest '$source_manifest' ---output_file '$output_file' ---output_dir '$output_file.extra_files_path' + --tag '$tag' + --description '$description' + --source_manifest '$source_manifest' +#if str($use_source_manifest) == "true" + --use_source_manifest +#end if +#if str($phone_home) == "true" + --phone_home + --phone_home_label '$phone_home_label' +#end if + --node_cache_dir '\${TMPDIR}' + --output_file '$output_file' + --output_dir '$output_file.extra_files_path' ]]></command> <inputs> - <param name="tag" type="text" label="Unique identifier for this database"/> - <param name="source_manifest" type="text" label="Source Manifest"/> + <param name="tag" type="text" optional="false" label="Unique identifier for this database"/> + <param name="description" type="text" optional="false" label="Description for this database"/> + <param name="source_manifest" type="text" optional="false" label="Manifest file for this database"/> + <param name="use_source_manifest" type="boolean" label="Should the tool use the source manifest"/> + <param name="phone_home" type="boolean" label="Should phone home be enabled"/> + <param name="phone_home_label" type="text" label="Phone home label"/> </inputs> <outputs> <data name="output_file" format="data_manager_json"/> </outputs> <tests> <test> - <param name="tag" value="test-only"/> + <param name="tag" value="test"/> + <param name="description" value="Test Database"/> <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/> + <param name="use_source_manifest" value="true"/> + <param name="phone_home" 
value="false"/> <output name="output_file" file="test.json" compare="re_match"/> </test> </tests> <help><![CDATA[ -This tool downloads NCBI FCS GX databases makes them available to the NCBI FCX GX tool. + +Overview +======== + +The NCBI FCS GX tool requires a curated reference database as described in the paper, `Rapid and sensitive detection of genome contamination at scale with FCS‑GX <https://doi.org/10.1186/s13059-024-03198-7>`_. The current database is about 470 GiB in total. Each database includes a json-formatted manifest file which contains details about each database file. A sample manifest file can be found below. + +The data manager downloads the GX database given a manifest file. It takes six inputs: + +1. **tag** - unique identifier for this database chosen by the Galaxy Admin +2. **description** - description for this database seen and selectable by the user when running the NCBI FCS GX tool +3. **source_manifest** - manifest file for this database (url or filesystem path) +4. **use_source_manifest** - when true, the compute node will download the GX database itself instead of using the local copy +5. **phone_home** - when true, the NCBI FCS GX tool will send analytics to NCBI about the run. The code for this can be seen `here <https://github.com/ncbi/fcs-gx/blob/release/scripts/run_gx.py#L79-L115>`_. It sends the following information: + + 1. version of the gx executable + 2. build date of the GX database + 3. the platform the software is running on + 4. the version of the Python interpreter + 5. the size of physical memory in GiB + 6. the duration of the run + 7. the run’s exit status (0 for success, otherwise 1) + 8. **phone_home_label** + +6. **phone_home_label** - arbitrary string set by the Galaxy Admin to identify the analytics data sent to NCBI + +The data manager also creates a lookup table for the NCBI FCS GX tool based on the `taxa.tsv <https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.taxa.tsv>`_ file in the database. 
+ +Sample Manifest File +==================== + +.. code-block:: JSON -See https://github.com/ncbi/fcs/wiki/FCS-GX#b-download-the-database + { + "version": 1, + "totalFiles": 8, + "timeStamp": "2023-01-24T16:18:22.220812", + "fileDetails": [ + { + "fileName": "all.blast_div.tsv.gz", + "fileSize": 8241107, + "hashAlgorithm": "md5", + "hashValue": "a6b08c85c46da76548fff6ed220f8f9d" + }, + { + "fileName": "all.assemblies.tsv", + "fileSize": 8887448, + "hashAlgorithm": "md5", + "hashValue": "441beceb8c467593fa6b87a071c5ec6b" + }, + { + "fileName": "all.taxa.tsv", + "fileSize": 6385518, + "hashAlgorithm": "md5", + "hashValue": "c94d1fc80f81dbbf30b114d4cdaf29ad" + }, + { + "fileName": "all.gxs", + "fileSize": 177317125807, + "hashAlgorithm": "md5", + "hashValue": "da205626565a61be6dfd8c9b5ed1a9b7" + }, + { + "fileName": "all.meta.jsonl", + "fileSize": 59, + "hashAlgorithm": "md5", + "hashValue": "c2096cdb8106d44a310052b06a23836c" + }, + { + "fileName": "all.gxi", + "fileSize": 321216733352, + "hashAlgorithm": "md5", + "hashValue": "36bf346693e2b9de693de38efe219aa7" + }, + { + "fileName": "all.seq_info.tsv.gz", + "fileSize": 22549956, + "hashAlgorithm": "md5", + "hashValue": "6a760eed5a94aaf46d4dd8c75f370875" + }, + { + "fileName": "all.README.txt", + "fileSize": 187, + "hashAlgorithm": "md5", + "hashValue": "7deb2d4fa5241f95a25073fb43147cb1" + } + ] + } ]]></help> <expand macro="citations"/> </tool>
--- a/data_manager/macros.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/macros.xml Wed Oct 09 08:53:07 2024 +0000 @@ -2,10 +2,9 @@ <xml name="requirements"> <requirements> <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement> - <yield/> </requirements> </xml> - <token name="@TOOL_VERSION@">0.5.0</token> + <token name="@TOOL_VERSION@">0.5.4</token> <token name="@VERSION_SUFFIX@">0</token> <token name="@PROFILE@">21.05</token> <xml name="edam_ontology"> @@ -16,7 +15,6 @@ <xml name="citations"> <citations> <citation type="doi">10.1101/2023.06.02.543519</citation> - <yield/> </citations> </xml> </macros>
--- a/data_manager_conf.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager_conf.xml Wed Oct 09 08:53:07 2024 +0000 @@ -1,18 +1,22 @@ <data_managers> <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader"> - <data_table name="ncbi_fcs_gx_databases"> + <data_table name="ncbi_fcs_gx_databases_ext"> <output> <column name="value" /> + <column name="name" /> <column name="source_manifest" /> - <column name="name" output_ref="output_file"> + <column name="use_source_manifest" /> + <column name="phone_home" /> + <column name="phone_home_label" /> + <column name="local_manifest" output_ref="output_file"> <move type="directory"> - <source>${name}</source> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases/${value}</target> + <source>${local_manifest}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases_ext/${value}</target> </move> <value_translation><![CDATA[ #import os #set manifest_filename = os.path.basename($source_manifest) -$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases/$value/$manifest_filename +$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases_ext/$value/$manifest_filename ]]></value_translation> <value_translation type="function">abspath</value_translation> </column>
--- a/test-data/ncbi_fcs_gx_databases.loc Fri Jan 12 22:11:17 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -#tag source_manifest local_manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_databases_ext.loc Wed Oct 09 08:53:07 2024 +0000 @@ -0,0 +1,2 @@ +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +test https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest 1 0 /scratch/rico/galaxy/tool-data/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- a/test-data/ncbi_fcs_gx_divisions.tsv Fri Jan 12 22:11:17 2024 +0000 +++ b/test-data/ncbi_fcs_gx_divisions.tsv Wed Oct 09 08:53:07 2024 +0000 @@ -1,1 +1,3 @@ -#gx_div tag description +#tag gx_div description +prok:CFB group bacteria test +unkn:unknown test
--- a/test-data/test.json Fri Jan 12 22:11:17 2024 +0000 +++ b/test-data/test.json Wed Oct 09 08:53:07 2024 +0000 @@ -1,24 +1,28 @@ \{ "data_tables": \{ - "ncbi_fcs_gx_databases": \{ + "ncbi_fcs_gx_databases_ext": \{ "add": \[ \{ - "name": "[^"]+", + "description": "Test Database", + "local_manifest": ".+", + "phone_home": "0", + "phone_home_label": "", "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest", - "value": "test-only" + "use_source_manifest": "1", + "value": "test" \} \] \}, "ncbi_fcs_gx_divisions": \{ "add": \[ \{ - "name": "Bacteria - CFB group bacteria", - "tag": "test-only", + "description": "Bacteria - CFB group bacteria", + "tag": "test", "value": "prok:CFB group bacteria" \}, \{ - "name": "Unknown / Unclassified", - "tag": "test-only", + "description": "Unknown / Unclassified", + "tag": "test", "value": "unkn:unknown" \} \]
--- a/tool-data/ncbi_fcs_gx_databases.loc.sample Fri Jan 12 22:11:17 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -## NCBI FCS GX Databases -# -#tag manifest path -#r2022-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest -#r2022-07-08 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest -#r2023-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest -#latest https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest /big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest -#test-only https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest /big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_databases_ext.loc.sample Wed Oct 09 08:53:07 2024 +0000 @@ -0,0 +1,19 @@ +# When phone_home is set to "1", the NCBI FCS GX tool will send analytics +# to NCBI about the run. The following information is sent: +# +# 1. version of the gx executable +# 2. build date of the GX database +# 3. the platform the software is running on +# 4. the version of the Python interpreter +# 5. the size of physical memory in GiB +# 6. the duration of the run +# 7. the run’s exit status (0 for success, otherwise 1) +# 8. phone_home_label +# +# The phone_home_label is an arbitrary string sent to NCBI to identify +# data. For instance, all NCBI FCS GX runs on usegalaxy.org use the +# phone_home_label "usegalaxy.org" +# +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +#latest Full GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/latest/all.manifest +#test Test GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- a/tool_data_table_conf.xml.sample Fri Jan 12 22:11:17 2024 +0000 +++ b/tool_data_table_conf.xml.sample Wed Oct 09 08:53:07 2024 +0000 @@ -1,8 +1,8 @@ <tables> <!-- Locations of NCBI FCS GX databases --> - <table name="ncbi_fcs_gx_databases" comment_char="#"> - <columns>value, source_manifest, name</columns> - <file path="tool-data/ncbi_fcs_gx_databases.loc" /> + <table name="ncbi_fcs_gx_databases_ext" comment_char="#"> + <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns> + <file path="tool-data/ncbi_fcs_gx_databases_ext.loc" /> </table> <!-- NCBI FCS GX divisions --> <table name="ncbi_fcs_gx_divisions" comment_char="#">
--- a/tool_data_table_conf.xml.test Fri Jan 12 22:11:17 2024 +0000 +++ b/tool_data_table_conf.xml.test Wed Oct 09 08:53:07 2024 +0000 @@ -1,8 +1,8 @@ <tables> <!-- Locations of NCBI FCS GX databases --> - <table name="ncbi_fcs_gx_databases" comment_char="#"> - <columns>value, source_manifest, name</columns> - <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases.loc" /> + <table name="ncbi_fcs_gx_databases_ext" comment_char="#"> + <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns> + <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases_ext.loc" /> </table> <!-- NCBI FCS GX divisions --> <table name="ncbi_fcs_gx_divisions" comment_char="#">