# HG changeset patch # User iuc # Date 1728463987 0 # Node ID 35cef758050c8486ce2d87976b2e147614b130d0 # Parent 6be6e6198ac3af1bc533b82a04e84e549fc8e63a planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 25c9d8d297d0e10f92e373f6a959274dedc10433 diff -r 6be6e6198ac3 -r 35cef758050c data_manager/data_manager_ncbi_fcs_gx_database_downloader.py --- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py Wed Oct 09 08:53:07 2024 +0000 @@ -4,14 +4,15 @@ import json import os import subprocess +import typing -def main(): +def main() -> None: opts = parse_args() output_dict = { "data_tables": { - "ncbi_fcs_gx_databases": sync_files(opts), + "ncbi_fcs_gx_databases_ext": sync_files(opts), "ncbi_fcs_gx_divisions": get_divisions(opts), } } @@ -20,17 +21,23 @@ print(json.dumps(output_dict, sort_keys=True, indent=2), file=f) -def parse_args(): +def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("--tag", required=True) - parser.add_argument("--source_manifest", required=True) + + parser.add_argument("--tag", required=True, help="Unique identifier for this database") + parser.add_argument("--description", required=True, help="Description for this database") + parser.add_argument("--source_manifest", required=True, help="Should the tool use the source manifest") + parser.add_argument("--use_source_manifest", action="store_true", help="Manifest file for this database") + parser.add_argument("--phone_home", action="store_true", help="Should phone home be enabled") + parser.add_argument("--phone_home_label", default="", help="Phone home label") + parser.add_argument("--node_cache_dir", required=True, help="Directory to copy database to local node") parser.add_argument("--output_file", required=True) parser.add_argument("--output_dir", 
required=True) return parser.parse_args() -def sync_files(opts): +def sync_files(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]: os.makedirs(opts.output_dir, exist_ok=True) args = [ @@ -51,8 +58,12 @@ "add": [ { "value": opts.tag, + "description": opts.description, "source_manifest": opts.source_manifest, - "name": opts.output_dir, + "use_source_manifest": "1" if opts.use_source_manifest else "0", + "phone_home": "1" if opts.phone_home else "0", + "phone_home_label": opts.phone_home_label, + "local_manifest": opts.output_dir, } ] } @@ -60,7 +71,7 @@ return entries_dict -def get_divisions(opts): +def get_divisions(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]: # descriptions for the top-level gx divisions top_level_description = { "anml": "Animals (Metazoa)", @@ -99,10 +110,10 @@ # add an element to support unknown/unclassified samples elements.append(("Unknown / Unclassified", "unkn:unknown")) - entries_dict = {"add": []} + entries_dict: typing.Dict[str, typing.List[typing.Dict[str, str]]] = {"add": []} for name, gx_div in sorted(elements): - entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "name": name}) + entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "description": name}) return entries_dict diff -r 6be6e6198ac3 -r 35cef758050c data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml --- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Wed Oct 09 08:53:07 2024 +0000 @@ -7,29 +7,129 @@ - - + + + + + + - + + + + `_. The current database is about 470 GiB in total. Each database includes a json-formatted manifest file with contains details about each database file. A sample manifest file can be found below. + +The data manager downloads the GX database given a manifest file. It takes six inputs: + +1. 
**tag** - unique identifier for this database chosen by the Galaxy Admin +2. **description** - description for this database seen and selectable by the user when running the NCBI FCS GX tool +3. **source_manifest** - manifest file for this database (URL or filesystem path) +4. **use_source_manifest** - when true, the compute node will download the GX database itself instead of using the local copy +5. **phone_home** - when true, the NCBI FCS GX tool will send analytics to NCBI about the run. The code for this can be seen `here `_. It sends the following information: + + 1. version of the gx executable + 2. build date of the GX database + 3. the platform the software is running on + 4. the version of the Python interpreter + 5. the size of physical memory in GiB + 6. the duration of the run + 7. the run’s exit status (0 for success, otherwise 1) + 8. **phone_home_label** + +6. **phone_home_label** - arbitrary string set by the Galaxy Admin to identify the analytics data sent to NCBI + +The data manager also creates a lookup table for the NCBI FCS GX tool based on the `taxa.tsv `_ file in the database. + +Sample Manifest File +==================== + +.. 
code-block:: JSON -See https://github.com/ncbi/fcs/wiki/FCS-GX#b-download-the-database + { + "version": 1, + "totalFiles": 8, + "timeStamp": "2023-01-24T16:18:22.220812", + "fileDetails": [ + { + "fileName": "all.blast_div.tsv.gz", + "fileSize": 8241107, + "hashAlgorithm": "md5", + "hashValue": "a6b08c85c46da76548fff6ed220f8f9d" + }, + { + "fileName": "all.assemblies.tsv", + "fileSize": 8887448, + "hashAlgorithm": "md5", + "hashValue": "441beceb8c467593fa6b87a071c5ec6b" + }, + { + "fileName": "all.taxa.tsv", + "fileSize": 6385518, + "hashAlgorithm": "md5", + "hashValue": "c94d1fc80f81dbbf30b114d4cdaf29ad" + }, + { + "fileName": "all.gxs", + "fileSize": 177317125807, + "hashAlgorithm": "md5", + "hashValue": "da205626565a61be6dfd8c9b5ed1a9b7" + }, + { + "fileName": "all.meta.jsonl", + "fileSize": 59, + "hashAlgorithm": "md5", + "hashValue": "c2096cdb8106d44a310052b06a23836c" + }, + { + "fileName": "all.gxi", + "fileSize": 321216733352, + "hashAlgorithm": "md5", + "hashValue": "36bf346693e2b9de693de38efe219aa7" + }, + { + "fileName": "all.seq_info.tsv.gz", + "fileSize": 22549956, + "hashAlgorithm": "md5", + "hashValue": "6a760eed5a94aaf46d4dd8c75f370875" + }, + { + "fileName": "all.README.txt", + "fileSize": 187, + "hashAlgorithm": "md5", + "hashValue": "7deb2d4fa5241f95a25073fb43147cb1" + } + ] + } ]]> diff -r 6be6e6198ac3 -r 35cef758050c data_manager/macros.xml --- a/data_manager/macros.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager/macros.xml Wed Oct 09 08:53:07 2024 +0000 @@ -2,10 +2,9 @@ ncbi-fcs-gx - - 0.5.0 + 0.5.4 0 21.05 @@ -16,7 +15,6 @@ 10.1101/2023.06.02.543519 - diff -r 6be6e6198ac3 -r 35cef758050c data_manager_conf.xml --- a/data_manager_conf.xml Fri Jan 12 22:11:17 2024 +0000 +++ b/data_manager_conf.xml Wed Oct 09 08:53:07 2024 +0000 @@ -1,18 +1,22 @@ - + + - + + + + - ${name} - ncbi_fcs_gx_databases/${value} + ${local_manifest} + ncbi_fcs_gx_databases_ext/${value} abspath diff -r 6be6e6198ac3 -r 35cef758050c test-data/ncbi_fcs_gx_databases.loc 
--- a/test-data/ncbi_fcs_gx_databases.loc Fri Jan 12 22:11:17 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -#tag source_manifest local_manifest diff -r 6be6e6198ac3 -r 35cef758050c test-data/ncbi_fcs_gx_databases_ext.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_databases_ext.loc Wed Oct 09 08:53:07 2024 +0000 @@ -0,0 +1,2 @@ +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +test https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest 1 0 /scratch/rico/galaxy/tool-data/ncbi_fcs_gx_databases_ext/test/test-only.manifest diff -r 6be6e6198ac3 -r 35cef758050c test-data/ncbi_fcs_gx_divisions.tsv --- a/test-data/ncbi_fcs_gx_divisions.tsv Fri Jan 12 22:11:17 2024 +0000 +++ b/test-data/ncbi_fcs_gx_divisions.tsv Wed Oct 09 08:53:07 2024 +0000 @@ -1,1 +1,3 @@ -#gx_div tag description +#tag gx_div description +prok:CFB group bacteria test +unkn:unknown test diff -r 6be6e6198ac3 -r 35cef758050c test-data/test.json --- a/test-data/test.json Fri Jan 12 22:11:17 2024 +0000 +++ b/test-data/test.json Wed Oct 09 08:53:07 2024 +0000 @@ -1,24 +1,28 @@ \{ "data_tables": \{ - "ncbi_fcs_gx_databases": \{ + "ncbi_fcs_gx_databases_ext": \{ "add": \[ \{ - "name": "[^"]+", + "description": "Test Database", + "local_manifest": ".+", + "phone_home": "0", + "phone_home_label": "", "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest", - "value": "test-only" + "use_source_manifest": "1", + "value": "test" \} \] \}, "ncbi_fcs_gx_divisions": \{ "add": \[ \{ - "name": "Bacteria - CFB group bacteria", - "tag": "test-only", + "description": "Bacteria - CFB group bacteria", + "tag": "test", "value": "prok:CFB group bacteria" \}, \{ - "name": "Unknown / Unclassified", - "tag": "test-only", + "description": "Unknown / Unclassified", + "tag": "test", "value": "unkn:unknown" \} \] diff -r 6be6e6198ac3 -r 35cef758050c 
tool-data/ncbi_fcs_gx_databases.loc.sample --- a/tool-data/ncbi_fcs_gx_databases.loc.sample Fri Jan 12 22:11:17 2024 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,8 +0,0 @@ -## NCBI FCS GX Databases -# -#tag manifest path -#r2022-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest -#r2022-07-08 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest -#r2023-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest -#latest https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest /big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest -#test-only https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest /big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest diff -r 6be6e6198ac3 -r 35cef758050c tool-data/ncbi_fcs_gx_databases_ext.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_databases_ext.loc.sample Wed Oct 09 08:53:07 2024 +0000 @@ -0,0 +1,19 @@ +# When phone_home is set to "1", the NCBI FCS GX tool will send analytics +# to NCBI about the run. The following information is sent: +# +# 1. version of the gx executable +# 2. build date of the GX database +# 3. the platform the software is running on +# 4. the version of the Python interpreter +# 5. the size of physical memory in GiB +# 6. the duration of the run +# 7. the run’s exit status (0 for success, otherwise 1) +# 8. phone_home_label +# +# The phone_home_label is an arbitrary string sent to NCBI to identify +# data. 
For instance, all NCBI FCS GX runs on usegalaxy.org use the +# phone_home_label "usegalaxy.org" +# +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +#latest Full GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/latest/all.manifest +#test Test GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/test/test-only.manifest diff -r 6be6e6198ac3 -r 35cef758050c tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Fri Jan 12 22:11:17 2024 +0000 +++ b/tool_data_table_conf.xml.sample Wed Oct 09 08:53:07 2024 +0000 @@ -1,8 +1,8 @@ - - value, source_manifest, name - +
+ value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest +
diff -r 6be6e6198ac3 -r 35cef758050c tool_data_table_conf.xml.test --- a/tool_data_table_conf.xml.test Fri Jan 12 22:11:17 2024 +0000 +++ b/tool_data_table_conf.xml.test Wed Oct 09 08:53:07 2024 +0000 @@ -1,8 +1,8 @@ -
- value, source_manifest, name - +
+ value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest +