# HG changeset patch
# User iuc
# Date 1723583623 0
# Node ID 6ab422fba1a3b49fd55526796e36b7de4bfbfd59
# Parent 2814c058a087db3b2541a19494fb31df16e7f8fb
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit ad14947c3e13babe90a6878b45608fe56a16150d
diff -r 2814c058a087 -r 6ab422fba1a3 data_manager/gtdbtk_database_installer.py
--- a/data_manager/gtdbtk_database_installer.py Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager/gtdbtk_database_installer.py Tue Aug 13 21:13:43 2024 +0000
@@ -1,25 +1,57 @@
#!/usr/bin/env python
import argparse
+import gzip
import json
import os
import shutil
import sys
import tarfile
+from datetime import datetime
from urllib.parse import urlparse
-from urllib.request import Request
-from urllib.request import urlopen
+from urllib.request import Request, urlopen
+
+# Hard-code the download URLs per release instead of taking a free-form URL: less room for admin error.
+urls = {
+ "202": {
+ "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
+ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz",
+ "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz",
+ },
+ "207": {
+ "full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz",
+ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz",
+ "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz",
+ },
+ "214": {
+ "full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz",
+ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/ar53_taxonomy_r214.tsv.gz",
+ "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/bac120_taxonomy_r214.tsv.gz",
+ },
+ "220": {
+ "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
+ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
+ "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
+ },
+ "test": { # using VERSION to check if files are there
+ "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
+ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz",
+ "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz",
+ },
+}
def url_download(url, target_directory):
url_parts = urlparse(url)
- tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path)))
+ tarball = os.path.abspath(
+ os.path.join(target_directory, os.path.basename(url_parts.path))
+ )
src = None
dst = None
try:
req = Request(url)
src = urlopen(req)
- with open(tarball, 'wb') as dst:
+ with open(tarball, "wb") as dst:
while True:
chunk = src.read(2**10)
if chunk:
@@ -32,9 +64,20 @@
if src is not None:
src.close()
if tarfile.is_tarfile(tarball):
- fh = tarfile.open(tarball, 'r:*')
+ fh = tarfile.open(tarball, "r:*")
else:
- return tarball
+ # unzip metadata file
+ if ".gz" in tarball:
+ with gzip.open(tarball, "rb") as f_in:
+ unzipped_file = tarball.strip(".gz")
+ with open(unzipped_file, "wb") as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ os.remove(tarball)
+ folder_of_unzipped_file = os.path.dirname(unzipped_file)
+ return folder_of_unzipped_file
+ else:
+ # only the test download (VERSION.txt, neither tar nor gzip) ends up here; return the file path unchanged
+ return tarball
fh.extractall(target_directory)
fh.close()
os.remove(tarball)
@@ -52,33 +95,83 @@
return target_directory
-def download(database_id, database_name, url, out_file):
+def download(database_name, release, meta, test, out_file):
with open(out_file) as fh:
params = json.load(fh)
- target_directory = params['output_data'][0]['extra_files_path']
+ target_directory = params["output_data"][0]["extra_files_path"]
os.makedirs(target_directory)
- file_path = url_download(url, target_directory)
+
+ if test:
+ release = "test"
+
+ # download both taxonomy metadata tables
+ if meta:
+ url = urls[release]["meta_ar"]
+ file_path = url_download(url, target_directory)
+ url = urls[release]["meta_bac"]
+ file_path = url_download(url, target_directory)
+ # download the full DB
+ else:
+ url = urls[release]["full"]
+ file_path = url_download(url, target_directory)
+
+ time = datetime.utcnow().strftime("%Y-%m-%d")
data_manager_json = {"data_tables": {}}
data_manager_entry = {}
- data_manager_entry['value'] = database_id
- data_manager_entry['name'] = database_name
- data_manager_entry['path'] = file_path
- data_manager_json["data_tables"]["gtdbtk_database"] = data_manager_entry
+ data_manager_entry["value"] = f"{database_name}_release_{release}_downloaded_{time}"
+ data_manager_entry["name"] = database_name
+ data_manager_entry["path"] = file_path
+ data_manager_entry["version"] = release
- with open(out_file, 'w') as fh:
+ # store in dedicated metadata table
+ if meta:
+ data_manager_json["data_tables"][
+ "gtdbtk_database_metadata_versioned"
+ ] = data_manager_entry
+ else:
+ data_manager_json["data_tables"][
+ "gtdbtk_database_versioned"
+ ] = data_manager_entry
+
+ with open(out_file, "w") as fh:
json.dump(data_manager_json, fh, sort_keys=True)
parser = argparse.ArgumentParser()
-parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name')
-parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id')
-parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version')
-parser.add_argument('--out_file', dest='out_file', help='JSON output file')
+parser.add_argument(
+ "--database_name", dest="database_name", help="GTDB-Tk database display name"
+)
+
+parser.add_argument("--version", dest="version", help="DB version")
+
+parser.add_argument(
+ "--release", dest="release", help="Release of the GTDB-Tk database version"
+)
+parser.add_argument("--out_file", dest="out_file", help="JSON output file")
+parser.add_argument(
+ "--meta",
+ dest="meta",
+ action="store_true",
+ help="Store meta data flag",
+)
+
+parser.add_argument(
+ "--test",
+ dest="test",
+ action="store_true",
+ help="Run test",
+)
args = parser.parse_args()
-download(args.database_id, args.database_name, args.url, args.out_file)
+download(
+ args.database_name,
+ args.release,
+ args.meta,
+ args.test,
+ args.out_file,
+)
diff -r 2814c058a087 -r 6ab422fba1a3 data_manager/gtdbtk_database_installer.xml
--- a/data_manager/gtdbtk_database_installer.xml Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager/gtdbtk_database_installer.xml Tue Aug 13 21:13:43 2024 +0000
@@ -11,41 +11,59 @@
-
-
+
+
+
+
+
+
+
+
+
-
+
-
+
+
+
+
+
+
+
+
+
+This data manager downloads the DB required for GTDB-Tk tools such as
the `gtdbtk classify_wf`. The meta option allows downloading only the metadata for the
+corresponding DB, which is used by tools like `gtdb_to_taxdump`.
doi.org/10.1038/s41587-020-0501-8
diff -r 2814c058a087 -r 6ab422fba1a3 data_manager_conf.xml
--- a/data_manager_conf.xml Tue Jan 03 09:05:09 2023 +0000
+++ b/data_manager_conf.xml Tue Aug 13 21:13:43 2024 +0000
@@ -1,14 +1,29 @@
-
+
+
+
+
diff -r 2814c058a087 -r 6ab422fba1a3 test-data/gtdbtk_database.loc
--- a/test-data/gtdbtk_database.loc Tue Jan 03 09:05:09 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-# This is a sample file distributed with Galaxy that enables tools
-# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc
-# file has this format (longer white space characters are TAB characters):
-#
-#
-#
-# So, for example, if you have the gtdbtk 202 stored in
-# /depot/data2/galaxy/gtdbtk/202/,
-# then the gtdbtk_databases.loc entry would look like this:
-#
-# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
-#
-# and your /depot/data2/galaxy/gtdbtk/release202 directory
-# would contain GTDB-Tk database files for release 202, sommething like this:
-#
-#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/
-#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv
-#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/
-#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/
-release202 GTDB-Tk database release 202 /depot/data2/galaxy/tool-data/gtdbtk_database/release202
diff -r 2814c058a087 -r 6ab422fba1a3 test-data/gtdbtk_database_metadata_versioned.loc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtdbtk_database_metadata_versioned.loc Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases. The gtdbtk_database_metadata_versioned.loc
+# file has this format (longer white space characters are TAB characters):
+#
+#
diff -r 2814c058a087 -r 6ab422fba1a3 test-data/gtdbtk_database_versioned.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtdbtk_database_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases. The gtdbtk_database_versioned.loc
+# file has this format (longer white space characters are TAB characters):
+#
+#
diff -r 2814c058a087 -r 6ab422fba1a3 tool-data/gtdbtk_database.loc.sample
--- a/tool-data/gtdbtk_database.loc.sample Tue Jan 03 09:05:09 2023 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-# This is a sample file distributed with Galaxy that enables tools
-# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc
-# file has this format (longer white space characters are TAB characters):
-#
-#
-#
-# So, for example, if you have the gtdbtk 202 stored in
-# /depot/data2/galaxy/gtdbtk/202/,
-# then the gtdbtk_databases.loc entry would look like this:
-#
-# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202
-#
-# and your /depot/data2/galaxy/gtdbtk/release202 directory
-# would contain GTDB-Tk database files for release 202, sommething like this:
-#
-#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/
-#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv
-#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/
-#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/
-#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/
diff -r 2814c058a087 -r 6ab422fba1a3 tool-data/gtdbtk_database_metadata_versioned.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gtdbtk_database_metadata_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,5 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases. The gtdbtk_database_metadata_versioned.loc
+# file has this format (longer white space characters are TAB characters):
+#
+#
diff -r 2814c058a087 -r 6ab422fba1a3 tool-data/gtdbtk_database_versioned.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gtdbtk_database_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000
@@ -0,0 +1,6 @@
+# This is a sample file distributed with Galaxy that enables tools
+# to use a directory of GTDB-Tk databases. The gtdbtk_database_versioned.loc
+# file has this format (longer white space characters are TAB characters):
+#
+#
+s
\ No newline at end of file
diff -r 2814c058a087 -r 6ab422fba1a3 tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Tue Jan 03 09:05:09 2023 +0000
+++ b/tool_data_table_conf.xml.sample Tue Aug 13 21:13:43 2024 +0000
@@ -1,7 +1,12 @@
+
-
- value, name, path
-
+
+ value, name, version, path
+
+
+
+ value, name, version, path
+
diff -r 2814c058a087 -r 6ab422fba1a3 tool_data_table_conf.xml.test
--- a/tool_data_table_conf.xml.test Tue Jan 03 09:05:09 2023 +0000
+++ b/tool_data_table_conf.xml.test Tue Aug 13 21:13:43 2024 +0000
@@ -1,7 +1,11 @@
-
- value, name, path
-
+
+ value, name, version, path
+
+
+
+ value, name, version, path
+