changeset 1:35cef758050c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 25c9d8d297d0e10f92e373f6a959274dedc10433
author iuc
date Wed, 09 Oct 2024 08:53:07 +0000
parents 6be6e6198ac3
children
files data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases.loc test-data/ncbi_fcs_gx_databases_ext.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases.loc.sample tool-data/ncbi_fcs_gx_databases_ext.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 12 files changed, 181 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py	Fri Jan 12 22:11:17 2024 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py	Wed Oct 09 08:53:07 2024 +0000
@@ -4,14 +4,15 @@
 import json
 import os
 import subprocess
+import typing
 
 
-def main():
+def main() -> None:
     opts = parse_args()
 
     output_dict = {
         "data_tables": {
-            "ncbi_fcs_gx_databases": sync_files(opts),
+            "ncbi_fcs_gx_databases_ext": sync_files(opts),
             "ncbi_fcs_gx_divisions": get_divisions(opts),
         }
     }
@@ -20,17 +21,23 @@
         print(json.dumps(output_dict, sort_keys=True, indent=2), file=f)
 
 
-def parse_args():
+def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
-    parser.add_argument("--tag", required=True)
-    parser.add_argument("--source_manifest", required=True)
+
+    parser.add_argument("--tag", required=True, help="Unique identifier for this database")
+    parser.add_argument("--description", required=True, help="Description for this database")
+    parser.add_argument("--source_manifest", required=True, help="Manifest file for this database")
+    parser.add_argument("--use_source_manifest", action="store_true", help="Should the tool use the source manifest")
+    parser.add_argument("--phone_home", action="store_true", help="Should phone home be enabled")
+    parser.add_argument("--phone_home_label", default="", help="Phone home label")
+    parser.add_argument("--node_cache_dir", required=True, help="Directory to copy database to local node")
     parser.add_argument("--output_file", required=True)
     parser.add_argument("--output_dir", required=True)
 
     return parser.parse_args()
 
 
-def sync_files(opts):
+def sync_files(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]:
     os.makedirs(opts.output_dir, exist_ok=True)
 
     args = [
@@ -51,8 +58,12 @@
         "add": [
             {
                 "value": opts.tag,
+                "description": opts.description,
                 "source_manifest": opts.source_manifest,
-                "name": opts.output_dir,
+                "use_source_manifest": "1" if opts.use_source_manifest else "0",
+                "phone_home": "1" if opts.phone_home else "0",
+                "phone_home_label": opts.phone_home_label,
+                "local_manifest": opts.output_dir,
             }
         ]
     }
@@ -60,7 +71,7 @@
     return entries_dict
 
 
-def get_divisions(opts):
+def get_divisions(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]:
     # descriptions for the top-level gx divisions
     top_level_description = {
         "anml": "Animals (Metazoa)",
@@ -99,10 +110,10 @@
     # add an element to support unknown/unclassified samples
     elements.append(("Unknown / Unclassified", "unkn:unknown"))
 
-    entries_dict = {"add": []}
+    entries_dict: typing.Dict[str, typing.List[typing.Dict[str, str]]] = {"add": []}
 
     for name, gx_div in sorted(elements):
-        entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "name": name})
+        entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "description": name})
 
     return entries_dict
 
--- a/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml	Fri Jan 12 22:11:17 2024 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml	Wed Oct 09 08:53:07 2024 +0000
@@ -7,29 +7,129 @@
     <expand macro="requirements"/>
     <command detect_errors="exit_code"><![CDATA[
 python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py'
---tag '$tag'
---source_manifest '$source_manifest'
---output_file '$output_file'
---output_dir '$output_file.extra_files_path'
+    --tag '$tag'
+    --description '$description'
+    --source_manifest '$source_manifest'
+#if str($use_source_manifest) == "true"
+    --use_source_manifest
+#end if
+#if str($phone_home) == "true"
+    --phone_home
+    --phone_home_label '$phone_home_label'
+#end if
+    --node_cache_dir '\${TMPDIR}'
+    --output_file '$output_file'
+    --output_dir '$output_file.extra_files_path'
     ]]></command>
     <inputs>
-        <param name="tag" type="text" label="Unique identifier for this database"/>
-        <param name="source_manifest" type="text" label="Source Manifest"/>
+        <param name="tag" type="text" optional="false" label="Unique identifier for this database"/>
+        <param name="description" type="text" optional="false" label="Description for this database"/>
+        <param name="source_manifest" type="text" optional="false" label="Manifest file for this database"/>
+        <param name="use_source_manifest" type="boolean" label="Should the tool use the source manifest"/>
+        <param name="phone_home" type="boolean" label="Should phone home be enabled"/>
+        <param name="phone_home_label" type="text" label="Phone home label"/>
     </inputs>
     <outputs>
         <data name="output_file" format="data_manager_json"/>
     </outputs>
     <tests>
         <test>
-            <param name="tag" value="test-only"/>
+            <param name="tag" value="test"/>
+            <param name="description" value="Test Database"/>
             <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/>
+            <param name="use_source_manifest" value="true"/>
+            <param name="phone_home" value="false"/>
             <output name="output_file" file="test.json" compare="re_match"/>
         </test>
     </tests>
     <help><![CDATA[
-This tool downloads NCBI FCS GX databases makes them available to the NCBI FCX GX tool.
+
+Overview
+========
+
+The NCBI FCS GX tool requires a curated reference database as described in the paper, `Rapid and sensitive detection of genome contamination at scale with FCS‑GX <https://doi.org/10.1186/s13059-024-03198-7>`_. The current database is about 470 GiB in total. Each database includes a JSON-formatted manifest file which contains details about each database file.  A sample manifest file can be found below.
+
+The data manager downloads the GX database given a manifest file.  It takes six inputs:
+
+1. **tag** - unique identifier for this database chosen by the Galaxy Admin
+2. **description** - description for this database, seen and selectable by the user when running the NCBI FCS GX tool
+3. **source_manifest** - manifest file for this database (url or filesystem path)
+4. **use_source_manifest** - when true, the compute node will download the GX database itself instead of using the local copy
+5. **phone_home** - when true, the NCBI FCS GX tool will send analytics to NCBI about the run. The code for this can be seen `here <https://github.com/ncbi/fcs-gx/blob/release/scripts/run_gx.py#L79-L115>`_. It sends the following information:
+
+   1. version of the gx executable
+   2. build date of the GX database
+   3. the platform the software is running on
+   4. the version of the Python interpreter
+   5. the size of physical memory in GiB
+   6. the duration of the run
+   7. the run’s exit status (0 for success, otherwise 1)
+   8. **phone_home_label**
+
+6. **phone_home_label** - arbitrary string set by the Galaxy Admin to identify the analytics data sent to NCBI
+
+The data manager also creates a lookup table for the NCBI FCS GX tool based on the `taxa.tsv <https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.taxa.tsv>`_ file in the database.
+
+Sample Manifest File
+====================
+
+.. code-block:: JSON
 
-See https://github.com/ncbi/fcs/wiki/FCS-GX#b-download-the-database
+   {
+     "version": 1,
+     "totalFiles": 8,
+     "timeStamp": "2023-01-24T16:18:22.220812",
+     "fileDetails": [
+       {
+         "fileName": "all.blast_div.tsv.gz",
+         "fileSize": 8241107,
+         "hashAlgorithm": "md5",
+         "hashValue": "a6b08c85c46da76548fff6ed220f8f9d"
+       },
+       {
+         "fileName": "all.assemblies.tsv",
+         "fileSize": 8887448,
+         "hashAlgorithm": "md5",
+         "hashValue": "441beceb8c467593fa6b87a071c5ec6b"
+       },
+       {
+         "fileName": "all.taxa.tsv",
+         "fileSize": 6385518,
+         "hashAlgorithm": "md5",
+         "hashValue": "c94d1fc80f81dbbf30b114d4cdaf29ad"
+       },
+       {
+         "fileName": "all.gxs",
+         "fileSize": 177317125807,
+         "hashAlgorithm": "md5",
+         "hashValue": "da205626565a61be6dfd8c9b5ed1a9b7"
+       },
+       {
+         "fileName": "all.meta.jsonl",
+         "fileSize": 59,
+         "hashAlgorithm": "md5",
+         "hashValue": "c2096cdb8106d44a310052b06a23836c"
+       },
+       {
+         "fileName": "all.gxi",
+         "fileSize": 321216733352,
+         "hashAlgorithm": "md5",
+         "hashValue": "36bf346693e2b9de693de38efe219aa7"
+       },
+       {
+         "fileName": "all.seq_info.tsv.gz",
+         "fileSize": 22549956,
+         "hashAlgorithm": "md5",
+         "hashValue": "6a760eed5a94aaf46d4dd8c75f370875"
+       },
+       {
+         "fileName": "all.README.txt",
+         "fileSize": 187,
+         "hashAlgorithm": "md5",
+         "hashValue": "7deb2d4fa5241f95a25073fb43147cb1"
+       }
+     ]
+   }
     ]]></help>
     <expand macro="citations"/>
 </tool>
--- a/data_manager/macros.xml	Fri Jan 12 22:11:17 2024 +0000
+++ b/data_manager/macros.xml	Wed Oct 09 08:53:07 2024 +0000
@@ -2,10 +2,9 @@
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement>
-            <yield/>
         </requirements>
     </xml>
-    <token name="@TOOL_VERSION@">0.5.0</token>
+    <token name="@TOOL_VERSION@">0.5.4</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">21.05</token>
     <xml name="edam_ontology">
@@ -16,7 +15,6 @@
     <xml name="citations">
         <citations>
             <citation type="doi">10.1101/2023.06.02.543519</citation>
-            <yield/>
         </citations>
     </xml>
 </macros>
--- a/data_manager_conf.xml	Fri Jan 12 22:11:17 2024 +0000
+++ b/data_manager_conf.xml	Wed Oct 09 08:53:07 2024 +0000
@@ -1,18 +1,22 @@
 <data_managers>
     <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader">
-        <data_table name="ncbi_fcs_gx_databases">
+        <data_table name="ncbi_fcs_gx_databases_ext">
             <output>
                 <column name="value" />
+                <column name="name" />
                 <column name="source_manifest" />
-                <column name="name" output_ref="output_file">
+                <column name="use_source_manifest" />
+                <column name="phone_home" />
+                <column name="phone_home_label" />
+                <column name="local_manifest" output_ref="output_file">
                     <move type="directory">
-                        <source>${name}</source>
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases/${value}</target>
+                        <source>${local_manifest}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases_ext/${value}</target>
                     </move>
                     <value_translation><![CDATA[
 #import os
 #set manifest_filename = os.path.basename($source_manifest)
-$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases/$value/$manifest_filename
+$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases_ext/$value/$manifest_filename
                     ]]></value_translation>
                     <value_translation type="function">abspath</value_translation>
                 </column>
--- a/test-data/ncbi_fcs_gx_databases.loc	Fri Jan 12 22:11:17 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-#tag	source_manifest	local_manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_fcs_gx_databases_ext.loc	Wed Oct 09 08:53:07 2024 +0000
@@ -0,0 +1,2 @@
+#tag	description	source_manifest	use_source_manifest	phone_home	phone_home_label	local_manifest
+test		https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest	1	0		/scratch/rico/galaxy/tool-data/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- a/test-data/ncbi_fcs_gx_divisions.tsv	Fri Jan 12 22:11:17 2024 +0000
+++ b/test-data/ncbi_fcs_gx_divisions.tsv	Wed Oct 09 08:53:07 2024 +0000
@@ -1,1 +1,3 @@
-#gx_div	tag	description
+#gx_div	tag	description
+prok:CFB group bacteria	test	
+unkn:unknown	test	
--- a/test-data/test.json	Fri Jan 12 22:11:17 2024 +0000
+++ b/test-data/test.json	Wed Oct 09 08:53:07 2024 +0000
@@ -1,24 +1,28 @@
 \{
   "data_tables": \{
-    "ncbi_fcs_gx_databases": \{
+    "ncbi_fcs_gx_databases_ext": \{
       "add": \[
         \{
-          "name": "[^"]+",
+          "description": "Test Database",
+          "local_manifest": ".+",
+          "phone_home": "0",
+          "phone_home_label": "",
           "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest",
-          "value": "test-only"
+          "use_source_manifest": "1",
+          "value": "test"
         \}
       \]
     \},
     "ncbi_fcs_gx_divisions": \{
       "add": \[
         \{
-          "name": "Bacteria - CFB group bacteria",
-          "tag": "test-only",
+          "description": "Bacteria - CFB group bacteria",
+          "tag": "test",
           "value": "prok:CFB group bacteria"
         \},
         \{
-          "name": "Unknown / Unclassified",
-          "tag": "test-only",
+          "description": "Unknown / Unclassified",
+          "tag": "test",
           "value": "unkn:unknown"
         \}
       \]
--- a/tool-data/ncbi_fcs_gx_databases.loc.sample	Fri Jan 12 22:11:17 2024 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,8 +0,0 @@
-## NCBI FCS GX Databases
-# 
-#tag	manifest	path
-#r2022-01-24	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest
-#r2022-07-08	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest
-#r2023-01-24	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest
-#latest	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest
-#test-only	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest	/big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_fcs_gx_databases_ext.loc.sample	Wed Oct 09 08:53:07 2024 +0000
@@ -0,0 +1,19 @@
+# When phone_home is set to "1", the NCBI FCS GX tool will send analytics
+# to NCBI about the run.  The following information is sent:
+#
+#  1. version of the gx executable
+#  2. build date of the GX database
+#  3. the platform the software is running on
+#  4. the version of the Python interpreter
+#  5. the size of physical memory in GiB
+#  6. the duration of the run
+#  7. the run’s exit status (0 for success, otherwise 1)
+#  8. phone_home_label
+#
+# The phone_home_label is an arbitrary string sent to NCBI to identify
+# data. For instance, all NCBI FCS GX runs on usegalaxy.org use the
+# phone_home_label "usegalaxy.org"
+#
+#tag	description	source_manifest	use_source_manifest	phone_home	phone_home_label	local_manifest
+#latest	Full GX Database	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest	0	1	usegalaxy.org	/big/data/dir/ncbi_fcs_gx_databases_ext/latest/all.manifest
+#test	Test GX Database	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest	0	1	usegalaxy.org	/big/data/dir/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- a/tool_data_table_conf.xml.sample	Fri Jan 12 22:11:17 2024 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Oct 09 08:53:07 2024 +0000
@@ -1,8 +1,8 @@
 <tables>
     <!-- Locations of NCBI FCS GX databases -->
-    <table name="ncbi_fcs_gx_databases" comment_char="#">
-        <columns>value, source_manifest, name</columns>
-        <file path="tool-data/ncbi_fcs_gx_databases.loc" />
+    <table name="ncbi_fcs_gx_databases_ext" comment_char="#">
+        <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns>
+        <file path="tool-data/ncbi_fcs_gx_databases_ext.loc" />
     </table>
     <!-- NCBI FCS GX divisions -->
     <table name="ncbi_fcs_gx_divisions" comment_char="#">
--- a/tool_data_table_conf.xml.test	Fri Jan 12 22:11:17 2024 +0000
+++ b/tool_data_table_conf.xml.test	Wed Oct 09 08:53:07 2024 +0000
@@ -1,8 +1,8 @@
 <tables>
     <!-- Locations of NCBI FCS GX databases -->
-    <table name="ncbi_fcs_gx_databases" comment_char="#">
-        <columns>value, source_manifest, name</columns>
-        <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases.loc" />
+    <table name="ncbi_fcs_gx_databases_ext" comment_char="#">
+        <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns>
+        <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases_ext.loc" />
     </table>
     <!-- NCBI FCS GX divisions -->
     <table name="ncbi_fcs_gx_divisions" comment_char="#">