changeset 2:6ab422fba1a3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit ad14947c3e13babe90a6878b45608fe56a16150d
author iuc
date Tue, 13 Aug 2024 21:13:43 +0000
parents 2814c058a087
children c4830a9870fa
files data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml data_manager_conf.xml test-data/gtdbtk_database.loc test-data/gtdbtk_database_metadata_versioned.loc test-data/gtdbtk_database_versioned.loc.sample tool-data/gtdbtk_database.loc.sample tool-data/gtdbtk_database_metadata_versioned.loc.sample tool-data/gtdbtk_database_versioned.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 197 insertions(+), 92 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py	Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager/gtdbtk_database_installer.py	Tue Aug 13 21:13:43 2024 +0000
@@ -1,25 +1,57 @@
 #!/usr/bin/env python
 
 import argparse
+import gzip
 import json
 import os
 import shutil
 import sys
 import tarfile
+from datetime import datetime
 from urllib.parse import urlparse
-from urllib.request import Request
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
+
+# Hard-code the download URLs per release instead of asking admins for a URL (less error-prone).
+urls = {
+    "202": {
+        "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
+        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz",
+        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz",
+    },
+    "207": {
+        "full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz",
+        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz",
+        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz",
+    },
+    "214": {
+        "full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz",
+        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/ar53_taxonomy_r214.tsv.gz",
+        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/bac120_taxonomy_r214.tsv.gz",
+    },
+    "220": {
+        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
+        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
+        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
+    },
+    "test": {  # "full" points at the small VERSION.txt instead of the tarball, just to check the files are reachable
+        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
+        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
+        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
+    },
+}
 
 
 def url_download(url, target_directory):
     url_parts = urlparse(url)
-    tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path)))
+    tarball = os.path.abspath(
+        os.path.join(target_directory, os.path.basename(url_parts.path))
+    )
     src = None
     dst = None
     try:
         req = Request(url)
         src = urlopen(req)
-        with open(tarball, 'wb') as dst:
+        with open(tarball, "wb") as dst:
             while True:
                 chunk = src.read(2**10)
                 if chunk:
@@ -32,9 +64,20 @@
         if src is not None:
             src.close()
     if tarfile.is_tarfile(tarball):
-        fh = tarfile.open(tarball, 'r:*')
+        fh = tarfile.open(tarball, "r:*")
     else:
-        return tarball
+        # unzip metadata file
+        if ".gz" in tarball:
+            with gzip.open(tarball, "rb") as f_in:
+                unzipped_file = tarball.strip(".gz")
+                with open(unzipped_file, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+                os.remove(tarball)
+                folder_of_unzipped_file = os.path.dirname(unzipped_file)
+            return folder_of_unzipped_file
+        else:
+            # plain file (neither tarball nor .gz); in practice only the test download ends up here
+            return tarball
     fh.extractall(target_directory)
     fh.close()
     os.remove(tarball)
@@ -52,33 +95,83 @@
     return target_directory
 
 
-def download(database_id, database_name, url, out_file):
+def download(database_name, release, meta, test, out_file):
 
     with open(out_file) as fh:
         params = json.load(fh)
 
-    target_directory = params['output_data'][0]['extra_files_path']
+    target_directory = params["output_data"][0]["extra_files_path"]
     os.makedirs(target_directory)
-    file_path = url_download(url, target_directory)
+
+    if test:
+        release = "test"
+
+    # download both taxonomy metadata tables
+    if meta:
+        url = urls[release]["meta_ar"]
+        file_path = url_download(url, target_directory)
+        url = urls[release]["meta_bac"]
+        file_path = url_download(url, target_directory)
+    # download the full DB
+    else:
+        url = urls[release]["full"]
+        file_path = url_download(url, target_directory)
+
+    time = datetime.utcnow().strftime("%Y-%m-%d")
 
     data_manager_json = {"data_tables": {}}
     data_manager_entry = {}
-    data_manager_entry['value'] = database_id
-    data_manager_entry['name'] = database_name
-    data_manager_entry['path'] = file_path
-    data_manager_json["data_tables"]["gtdbtk_database"] = data_manager_entry
+    data_manager_entry["value"] = f"{database_name}_release_{release}_downloaded_{time}"
+    data_manager_entry["name"] = database_name
+    data_manager_entry["path"] = file_path
+    data_manager_entry["version"] = release
 
-    with open(out_file, 'w') as fh:
+    # store in dedicated metadata table
+    if meta:
+        data_manager_json["data_tables"][
+            "gtdbtk_database_metadata_versioned"
+        ] = data_manager_entry
+    else:
+        data_manager_json["data_tables"][
+            "gtdbtk_database_versioned"
+        ] = data_manager_entry
+
+    with open(out_file, "w") as fh:
         json.dump(data_manager_json, fh, sort_keys=True)
 
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name')
-parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id')
-parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version')
-parser.add_argument('--out_file', dest='out_file', help='JSON output file')
+parser.add_argument(
+    "--database_name", dest="database_name", help="GTDB-Tk database display name"
+)
+
+parser.add_argument("--version", dest="version", help="DB version")
+
+parser.add_argument(
+    "--release", dest="release", help="Release of the GTDB-Tk database version"
+)
+parser.add_argument("--out_file", dest="out_file", help="JSON output file")
+parser.add_argument(
+    "--meta",
+    dest="meta",
+    action="store_true",
+    help="Store meta data flag",
+)
+
+parser.add_argument(
+    "--test",
+    dest="test",
+    action="store_true",
+    help="Run test",
+)
 
 args = parser.parse_args()
 
-download(args.database_id, args.database_name, args.url, args.out_file)
+download(
+    args.database_name,
+    args.release,
+    args.meta,
+    args.test,
+    args.out_file,
+)
--- a/data_manager/gtdbtk_database_installer.xml	Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager/gtdbtk_database_installer.xml	Tue Aug 13 21:13:43 2024 +0000
@@ -11,41 +11,59 @@
     <command>
     <![CDATA[
         python '$__tool_directory__/gtdbtk_database_installer.py'
-          --database_id '$database_id'
           --database_name '$database_name'
-          --url '$url'
+          --release '$release'
           --out_file '$out_file'
+          $meta
+          $test
     ]]>
     </command>
     <inputs>
 	    <param name="database_name" type="text" value="" label="Database name or description" help="This value will be displayed in the GTDB-Tk Database select list"/>
-        <param name="database_id" type="text" value="" label="Database id" help="This value must be unique with no whitespace allowed - use underscores"/>
-        <param
-            name="url"
-            type="text"
-            value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz"
-            label="URL for GTDB release"
-            help="This should point to a GTDB release tarball. A table of available databases and their version compatability can be found at https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data."
-        />
+        <param name="meta" type="boolean" truevalue="--meta" falsevalue="" checked="false" label="Only store GTDBTK metadata in a dedicated data table. " />
+        <param name="test" type="hidden" value="" checked="false" label="Run a dry test run !" />
+        <param name="release" type="select" multiple="false" label="GTDB Release">
+            <option value="202">202</option>
+            <option value="207">207</option>
+            <option value="214">214</option>
+            <option value="220">220</option>
+        </param>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json"/>
     </outputs>
     <tests>
         <test>
+            <!-- TODO -->
             <!-- Not actually installing a huge GTDB-Tk database -->
-            <param name="database_id" value="release202"/>
+            <param name="release" value="202"/>
             <param name="database_name" value="GTDB-Tk database release 202"/>
-            <param name="url" value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/VERSION"/>
+            <param name="test" value="--test"/>
             <output name="out_file">
                 <assert_contents>
                     <has_text text="GTDB-Tk database release 202"/>
-                    <has_text text="release202"/>
+                    <has_text text="release_test"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <!-- Test meta data download -->
+            <param name="release" value="202"/>
+            <param name="database_name" value="GTDB-Tk database release 202 metadata"/>
+            <param name="meta" value="true"/>
+            <param name="test" value="--test"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="GTDB-Tk database release 202 metadata"/>
+                    <has_text text="release_test"/>
                 </assert_contents>
             </output>
         </test>
     </tests>
     <help>
+This data manager downloads the database required by GTDB-Tk tools such as
+`gtdbtk classify_wf`. The meta option allows downloading only the metadata for the
+corresponding database, which is used by tools like `gtdb_to_taxdump`.
     </help>
     <citations>
         <citation type="doi">doi.org/10.1038/s41587-020-0501-8</citation>
--- a/data_manager_conf.xml	Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager_conf.xml	Tue Aug 13 21:13:43 2024 +0000
@@ -1,14 +1,29 @@
 <data_managers>
     <data_manager tool_file="data_manager/gtdbtk_database_installer.xml" id="gtdbtk_database_installer">
-        <data_table name="gtdbtk_database">
+        <data_table name="gtdbtk_database_versioned">
             <output>
                 <column name="value"/>
                 <column name="name"/>
+                <column name="version"/>
                 <column name="path" output_ref="out_file">
                     <move type="directory" relativize_symlinks="True">
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database/${value}</target>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_versioned/${value}</target>
                     </move>
-                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database/${value}</value_translation>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_versioned/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="gtdbtk_database_metadata_versioned">
+            <output>
+                <column name="value"/>
+                <column name="name"/>
+                <column name="version"/>
+                <column name="path" output_ref="out_file">
+                    <move type="directory" relativize_symlinks="True">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_metadata_versioned/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_metadata_versioned/${value}</value_translation>
                     <value_translation type="function">abspath</value_translation>
                 </column>
             </output>
--- a/test-data/gtdbtk_database.loc	Tue Jan 03 09:05:09 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-# This is a sample file distributed with Galaxy that enables tools
-# to use a directory of GTDB-Tk databases.  The gtdbtk_databases.loc
-# file has this format (longer white space characters are TAB characters):
-#
-# <unique_build_id> <display_name>  <directory_path>
-#
-# So, for example, if you have the gtdbtk 202 stored in 
-# /depot/data2/galaxy/gtdbtk/202/, 
-# then the gtdbtk_databases.loc entry would look like this:
-#
-# release202    gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
-#
-# and your /depot/data2/galaxy/gtdbtk/release202 directory
-# would contain GTDB-Tk database files for release 202, sommething like this:
-#
-#drwxr-sr-x  3 gvk G-824019    4096 Apr 20  2021 fastani/
-#-rw-r--r--  1 gvk G-824019 4810764 Apr 22  2021 manifest.tsv
-#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 markers/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 masks/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 metadata/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 21  2021 mrca_red/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 msa/
-#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 pplacer/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 radii/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 taxonomy/
-release202		GTDB-Tk database release 202	/depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtdbtk_database_metadata_versioned.loc	Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB taxonomy metadata tables.  The gtdbtk_database_metadata_versioned.loc
+# file has this format (columns are separated by TAB characters):
+#
+# <unique_build_id> <display_name> <version> <directory_path>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtdbtk_database_versioned.loc.sample	Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases.  The gtdbtk_database_versioned.loc
+# file has this format (columns are separated by TAB characters):
+#
+# <unique_build_id> <display_name> <version> <directory_path>
--- a/tool-data/gtdbtk_database.loc.sample	Tue Jan 03 09:05:09 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-# This is a sample file distributed with Galaxy that enables tools
-# to use a directory of GTDB-Tk databases.  The gtdbtk_databases.loc
-# file has this format (longer white space characters are TAB characters):
-#
-# <unique_build_id> <display_name>  <directory_path>
-#
-# So, for example, if you have the gtdbtk 202 stored in 
-# /depot/data2/galaxy/gtdbtk/202/, 
-# then the gtdbtk_databases.loc entry would look like this:
-#
-# release202    gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
-#
-# and your /depot/data2/galaxy/gtdbtk/release202 directory
-# would contain GTDB-Tk database files for release 202, sommething like this:
-#
-#drwxr-sr-x  3 gvk G-824019    4096 Apr 20  2021 fastani/
-#-rw-r--r--  1 gvk G-824019 4810764 Apr 22  2021 manifest.tsv
-#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 markers/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 masks/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 metadata/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 21  2021 mrca_red/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 msa/
-#drwxr-sr-x  4 gvk G-824019    4096 Apr 21  2021 pplacer/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 radii/
-#drwxr-sr-x  2 gvk G-824019    4096 Apr 20  2021 taxonomy/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gtdbtk_database_metadata_versioned.loc.sample	Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB taxonomy metadata tables.  The gtdbtk_database_metadata_versioned.loc
+# file has this format (columns are separated by TAB characters):
+#
+# <unique_build_id> <display_name> <version> <directory_path>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gtdbtk_database_versioned.loc.sample	Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,6 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases.  The gtdbtk_database_versioned.loc
+# file has this format (columns are separated by TAB characters):
+#
+# <unique_build_id> <display_name> <version> <directory_path>
+#
\ No newline at end of file
--- a/tool_data_table_conf.xml.sample	Tue Jan 03 09:05:09 2023 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Aug 13 21:13:43 2024 +0000
@@ -1,7 +1,12 @@
+<?xml version="1.0"?>
 <tables>
     <!-- Locations of GTDB-Tk database versions 202 and higher -->
-    <table name="gtdbtk_database" comment_char="#">
-        <columns>value, name, path</columns>
-        <file path="tool-data/gtdbtk_database.loc" />
+    <table name="gtdbtk_database_versioned" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="tool-data/gtdbtk_database_versioned.loc" />
+    </table>
+    <table name="gtdbtk_database_metadata_versioned" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="tool-data/gtdbtk_database_metadata_versioned.loc" />
     </table>
 </tables>
--- a/tool_data_table_conf.xml.test	Tue Jan 03 09:05:09 2023 +0000
+++ b/tool_data_table_conf.xml.test	Tue Aug 13 21:13:43 2024 +0000
@@ -1,7 +1,11 @@
 <tables>
     <!-- Location of databases for gtdbtk version 202 and higher -->
-    <table name="gtdbtk_database" comment_char="#">
-        <columns>value, name, path</columns>
-        <file path="${__HERE__}/test-data/gtdbtk_database.loc" />
+    <table name="gtdbtk_database_versioned" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="${__HERE__}/test-data/gtdbtk_database_versioned.loc" />
+    </table>
+    <table name="gtdbtk_database_metadata_versioned" comment_char="#">
+        <columns>value, name, version, path</columns>
+        <file path="${__HERE__}/test-data/gtdbtk_database_metadata_versioned.loc" />
     </table>
 </tables>