Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
changeset 4:10232d2b5062 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
author | iuc |
---|---|
date | Fri, 16 Aug 2024 08:44:14 +0000 |
parents | c4830a9870fa |
children | e7b39a7e0024 |
files | data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml |
diffstat | 2 files changed, 87 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py Fri Aug 16 08:44:14 2024 +0000
@@ -9,7 +9,7 @@
 import tarfile
 from datetime import datetime
 from urllib.parse import urlparse
-from urllib.request import Request, urlopen
+from urllib.request import HTTPError, Request, urlopen
 
 # rather provide the urls based on the release, less error potential for the admins !
 urls = {
@@ -33,15 +33,21 @@
         "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
         "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
     },
-    "test": {  # using VERSION to check if files are there
-        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
-        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
-        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
-    },
 }
 
 
-def url_download(url, target_directory):
+def is_urlfile(url):
+    # Check if online file exists
+    try:
+        r = urlopen(url)  # response
+        return r.getcode() < 400
+    except HTTPError:
+        return False
+
+
+def url_download(url, target_directory, meta):
+
+    # download the url
     url_parts = urlparse(url)
     tarball = os.path.abspath(
         os.path.join(target_directory, os.path.basename(url_parts.path))
@@ -63,36 +69,56 @@
     finally:
         if src is not None:
             src.close()
-    if tarfile.is_tarfile(tarball):
-        fh = tarfile.open(tarball, "r:*")
-    else:
-        # unzip metadata file
-        if ".gz" in tarball:
+
+    # extract the metadata
+    if meta:
+        # extract the content of *.tar.gz into the target dir
+        if tarfile.is_tarfile(tarball):
+            fh = tarfile.open(tarball, "r:*")
+            fh.extractall(target_directory)
+            fh.close()
+            os.remove(tarball)
+            return target_directory  # return path to output folder
+        # extract the content of *.gz into the target dir
+        elif ".gz" in tarball:
             with gzip.open(tarball, "rb") as f_in:
                 unzipped_file = tarball.strip(".gz")
                 with open(unzipped_file, "wb") as f_out:
                     shutil.copyfileobj(f_in, f_out)
-                    os.remove(tarball)
-                    folder_of_unzipped_file = os.path.dirname(unzipped_file)
+            os.remove(tarball)
+            folder_of_unzipped_file = os.path.dirname(unzipped_file)
             return folder_of_unzipped_file
         else:
-            # this is basically only the return for the test not using a tarfile
+            sys.exit(
+                "No correct input format for metadata file, must be .tar.gz or .gz"
+            )
+    else:
+        # handle the DB
+        # extract the content of the folder in the tar.gz into the target dir
+        if tarfile.is_tarfile(tarball):
+            fh = tarfile.open(tarball, "r:*")
+            fh.extractall(target_directory)
+            fh.close()
+            os.remove(tarball)
+        else:
+            # handle the test case for the DB
             return tarball
-    fh.extractall(target_directory)
-    fh.close()
-    os.remove(tarball)
-    # The tarball extraction will create a directory named
-    # something like release202 in the target_directory, so
-    # we need to move the items in that directory to the
-    # target directory.
-    subdir = next(os.walk(target_directory))[1][0]
-    subdir_path = os.path.join(target_directory, subdir)
-    items = os.listdir(subdir_path)
-    for item in items:
-        item_path = os.path.join(subdir_path, item)
-        shutil.move(item_path, target_directory)
-    os.rmdir(subdir_path)
-    return target_directory
+
+        fh.extractall(target_directory)
+        fh.close()
+        os.remove(tarball)
+        # The tarball extraction will create a directory named
+        # something like release202 in the target_directory, so
+        # we need to move the items in that directory to the
+        # target directory.
+        subdir = next(os.walk(target_directory))[1][0]
+        subdir_path = os.path.join(target_directory, subdir)
+        items = os.listdir(subdir_path)
+        for item in items:
+            item_path = os.path.join(subdir_path, item)
+            shutil.move(item_path, target_directory)
+        os.rmdir(subdir_path)
+        return target_directory
 
 
 def download(database_name, release, meta, test, out_file):
@@ -104,18 +130,26 @@
         os.makedirs(target_directory)
 
     if test:
-        release = "test"
+        # switch the DB to use the test case
+        urls[release][
+            "full"
+        ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt"
+
+        # make use of the test to check if all urls exists
+        for _version, items in urls.items():
+            for url in items.values():
+                assert is_urlfile(url)
 
     # download both taxonomy metadata tables
     if meta:
         url = urls[release]["meta_ar"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
         url = urls[release]["meta_bac"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
     # download the full DB
     else:
         url = urls[release]["full"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
 
     time = datetime.utcnow().strftime("%Y-%m-%d")
 
--- a/data_manager/gtdbtk_database_installer.xml Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml Fri Aug 16 08:44:14 2024 +0000
@@ -36,26 +36,38 @@
         <test>
             <!-- TODO -->
             <!-- Not actually installing a huge GTDB-Tk database -->
+            <!-- but it will check if all urls exist -->
             <param name="release" value="202"/>
             <param name="database_name" value="GTDB-Tk database release 202"/>
             <param name="test" value="--test"/>
             <output name="out_file">
                 <assert_contents>
                     <has_text text="GTDB-Tk database release 202"/>
-                    <has_text text="release_test"/>
+                    <has_text text="release_202"/>
                 </assert_contents>
             </output>
         </test>
         <test>
-            <!-- Test meta data download -->
-            <param name="release" value="202"/>
-            <param name="database_name" value="GTDB-Tk database release 202 metadata"/>
+            <!-- Test meta data download with tsv.gz-->
+            <param name="release" value="220"/>
+            <param name="database_name" value="GTDB-Tk database release 220 metadata"/>
             <param name="meta" value="true"/>
-            <param name="test" value="--test"/>
             <output name="out_file">
                 <assert_contents>
-                    <has_text text="GTDB-Tk database release 202 metadata"/>
-                    <has_text text="release_test"/>
+                    <has_text text="GTDB-Tk database release 220 metadata"/>
+                    <has_text text="release_220"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <!-- Test meta data download with tar.gz -->
+            <param name="release" value="207"/>
+            <param name="database_name" value="GTDB-Tk database release 207 metadata"/>
+            <param name="meta" value="true"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="GTDB-Tk database release 207 metadata"/>
+                    <has_text text="release_207"/>
                 </assert_contents>
             </output>
         </test>