# HG changeset patch # User iuc # Date 1723797854 0 # Node ID 10232d2b5062dc1432ddd141313408eac232299b # Parent c4830a9870fabc8a412785fc4de5674478402ca1 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be diff -r c4830a9870fa -r 10232d2b5062 data_manager/gtdbtk_database_installer.py --- a/data_manager/gtdbtk_database_installer.py Wed Aug 14 18:02:46 2024 +0000 +++ b/data_manager/gtdbtk_database_installer.py Fri Aug 16 08:44:14 2024 +0000 @@ -9,7 +9,7 @@ import tarfile from datetime import datetime from urllib.parse import urlparse -from urllib.request import Request, urlopen +from urllib.request import HTTPError, Request, urlopen # rather provide the urls based on the release, less error potential for the admins ! urls = { @@ -33,15 +33,21 @@ "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", }, - "test": { # using VERSION to check if files are there - "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt", - "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", - "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", - }, } -def url_download(url, target_directory): +def is_urlfile(url): + # Check if online file exists + try: + r = urlopen(url) # response + return r.getcode() < 400 + except HTTPError: + return False + + +def url_download(url, target_directory, meta): + + # download the url url_parts = urlparse(url) tarball = os.path.abspath( os.path.join(target_directory, os.path.basename(url_parts.path)) @@ -63,36 +69,56 @@ finally: if src is not None: src.close() - if tarfile.is_tarfile(tarball): - fh = tarfile.open(tarball, "r:*") - else: - # unzip metadata file - if ".gz" in tarball: + + # extract the metadata + if meta: + # extract the content of *.tar.gz into the target dir + if tarfile.is_tarfile(tarball): + fh = tarfile.open(tarball, "r:*") + fh.extractall(target_directory) + fh.close() + os.remove(tarball) + return target_directory # return path to output folder + # extract the content of *.gz into the target dir + elif ".gz" in tarball: with gzip.open(tarball, "rb") as f_in: unzipped_file = tarball.strip(".gz") with open(unzipped_file, "wb") as f_out: shutil.copyfileobj(f_in, f_out) - os.remove(tarball) - folder_of_unzipped_file = os.path.dirname(unzipped_file) + os.remove(tarball) + folder_of_unzipped_file = os.path.dirname(unzipped_file) return folder_of_unzipped_file else: - # this is basically only the return for the test not using a tarfile + sys.exit( + "No correct input format for metadata file, must be .tar.gz or .gz" + ) + else: + # handle the DB + # extract the content of the folder in the tar.gz into the target dir + if tarfile.is_tarfile(tarball): + fh = tarfile.open(tarball, "r:*") + fh.extractall(target_directory) + fh.close() + os.remove(tarball) + else: + # handle the test case for the DB return tarball - fh.extractall(target_directory) - fh.close() - os.remove(tarball) - # The tarball extraction will create a directory named - # something like release202 in the target_directory, so - # we need to move the items in that directory to the - # target directory. - subdir = next(os.walk(target_directory))[1][0] - subdir_path = os.path.join(target_directory, subdir) - items = os.listdir(subdir_path) - for item in items: - item_path = os.path.join(subdir_path, item) - shutil.move(item_path, target_directory) - os.rmdir(subdir_path) - return target_directory + + fh.extractall(target_directory) + fh.close() + os.remove(tarball) + # The tarball extraction will create a directory named + # something like release202 in the target_directory, so + # we need to move the items in that directory to the + # target directory. + subdir = next(os.walk(target_directory))[1][0] + subdir_path = os.path.join(target_directory, subdir) + items = os.listdir(subdir_path) + for item in items: + item_path = os.path.join(subdir_path, item) + shutil.move(item_path, target_directory) + os.rmdir(subdir_path) + return target_directory def download(database_name, release, meta, test, out_file): @@ -104,18 +130,26 @@ os.makedirs(target_directory) if test: - release = "test" + # switch the DB to use the test case + urls[release][ + "full" + ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt" + + # make use of the test to check if all urls exists + for _version, items in urls.items(): + for url in items.values(): + assert is_urlfile(url) # download both taxonomy metadata tables if meta: url = urls[release]["meta_ar"] - file_path = url_download(url, target_directory) + file_path = url_download(url, target_directory, meta) url = urls[release]["meta_bac"] - file_path = url_download(url, target_directory) + file_path = url_download(url, target_directory, meta) # download the full DB else: url = urls[release]["full"] - file_path = url_download(url, target_directory) + file_path = url_download(url, target_directory, meta) time = datetime.utcnow().strftime("%Y-%m-%d") diff -r c4830a9870fa -r 10232d2b5062 data_manager/gtdbtk_database_installer.xml --- a/data_manager/gtdbtk_database_installer.xml Wed Aug 14 18:02:46 2024 +0000 +++ b/data_manager/gtdbtk_database_installer.xml Fri Aug 16 08:44:14 2024 +0000 @@ -36,26 +36,38 @@ + - + - - - + + + - - - + + + + + + + + + + + + + +