# HG changeset patch
# User iuc
# Date 1723797854 0
# Node ID 10232d2b5062dc1432ddd141313408eac232299b
# Parent c4830a9870fabc8a412785fc4de5674478402ca1
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
diff -r c4830a9870fa -r 10232d2b5062 data_manager/gtdbtk_database_installer.py
--- a/data_manager/gtdbtk_database_installer.py Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py Fri Aug 16 08:44:14 2024 +0000
@@ -9,7 +9,7 @@
import tarfile
from datetime import datetime
from urllib.parse import urlparse
-from urllib.request import Request, urlopen
+from urllib.request import HTTPError, Request, urlopen
# rather provide the urls based on the release, less error potential for the admins !
urls = {
@@ -33,15 +33,21 @@
"meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
"meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
},
- "test": { # using VERSION to check if files are there
- "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
- "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
- "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
- },
}
-def url_download(url, target_directory):
+def is_urlfile(url):
+ # Check if online file exists
+    try:
+        with urlopen(url) as r:  # close the response to avoid leaking the socket
+            return r.getcode() < 400
+    except HTTPError:
+        return False
+
+
+def url_download(url, target_directory, meta):
+
+ # download the url
url_parts = urlparse(url)
tarball = os.path.abspath(
os.path.join(target_directory, os.path.basename(url_parts.path))
@@ -63,36 +69,56 @@
finally:
if src is not None:
src.close()
- if tarfile.is_tarfile(tarball):
- fh = tarfile.open(tarball, "r:*")
- else:
- # unzip metadata file
- if ".gz" in tarball:
+
+ # extract the metadata
+ if meta:
+ # extract the content of *.tar.gz into the target dir
+ if tarfile.is_tarfile(tarball):
+ fh = tarfile.open(tarball, "r:*")
+ fh.extractall(target_directory)
+ fh.close()
+ os.remove(tarball)
+ return target_directory # return path to output folder
+ # extract the content of *.gz into the target dir
+ elif ".gz" in tarball:
with gzip.open(tarball, "rb") as f_in:
unzipped_file = tarball.strip(".gz")
with open(unzipped_file, "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
- os.remove(tarball)
- folder_of_unzipped_file = os.path.dirname(unzipped_file)
+ os.remove(tarball)
+ folder_of_unzipped_file = os.path.dirname(unzipped_file)
return folder_of_unzipped_file
else:
- # this is basically only the return for the test not using a tarfile
+            sys.exit(
+                "Unsupported metadata file format: expected .tar.gz or .gz"
+            )
+ else:
+ # handle the DB
+ # extract the content of the folder in the tar.gz into the target dir
+ if tarfile.is_tarfile(tarball):
+ fh = tarfile.open(tarball, "r:*")
+ fh.extractall(target_directory)
+ fh.close()
+ os.remove(tarball)
+ else:
+ # handle the test case for the DB
return tarball
- fh.extractall(target_directory)
- fh.close()
- os.remove(tarball)
- # The tarball extraction will create a directory named
- # something like release202 in the target_directory, so
- # we need to move the items in that directory to the
- # target directory.
- subdir = next(os.walk(target_directory))[1][0]
- subdir_path = os.path.join(target_directory, subdir)
- items = os.listdir(subdir_path)
- for item in items:
- item_path = os.path.join(subdir_path, item)
- shutil.move(item_path, target_directory)
- os.rmdir(subdir_path)
- return target_directory
+
+ fh.extractall(target_directory)
+ fh.close()
+ os.remove(tarball)
+ # The tarball extraction will create a directory named
+ # something like release202 in the target_directory, so
+ # we need to move the items in that directory to the
+ # target directory.
+ subdir = next(os.walk(target_directory))[1][0]
+ subdir_path = os.path.join(target_directory, subdir)
+ items = os.listdir(subdir_path)
+ for item in items:
+ item_path = os.path.join(subdir_path, item)
+ shutil.move(item_path, target_directory)
+ os.rmdir(subdir_path)
+ return target_directory
def download(database_name, release, meta, test, out_file):
@@ -104,18 +130,26 @@
os.makedirs(target_directory)
if test:
- release = "test"
+ # switch the DB to use the test case
+ urls[release][
+ "full"
+ ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt"
+
+    # make use of the test to check that all urls exist
+    for _version, items in urls.items():
+        for url in items.values():
+            assert is_urlfile(url), f"URL is not reachable: {url}"
# download both taxonomy metadata tables
if meta:
url = urls[release]["meta_ar"]
- file_path = url_download(url, target_directory)
+ file_path = url_download(url, target_directory, meta)
url = urls[release]["meta_bac"]
- file_path = url_download(url, target_directory)
+ file_path = url_download(url, target_directory, meta)
# download the full DB
else:
url = urls[release]["full"]
- file_path = url_download(url, target_directory)
+ file_path = url_download(url, target_directory, meta)
time = datetime.utcnow().strftime("%Y-%m-%d")
diff -r c4830a9870fa -r 10232d2b5062 data_manager/gtdbtk_database_installer.xml
--- a/data_manager/gtdbtk_database_installer.xml Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml Fri Aug 16 08:44:14 2024 +0000
@@ -36,26 +36,38 @@
+
-
-
-
+
+
+
-
+
+
+
+
+
+
+