changeset 4:10232d2b5062 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
author iuc
date Fri, 16 Aug 2024 08:44:14 +0000
parents c4830a9870fa
children e7b39a7e0024
files data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml
diffstat 2 files changed, 87 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py	Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py	Fri Aug 16 08:44:14 2024 +0000
@@ -9,7 +9,7 @@
 import tarfile
 from datetime import datetime
 from urllib.parse import urlparse
-from urllib.request import Request, urlopen
+from urllib.request import HTTPError, Request, urlopen
 
 # rather provide the urls based on the release, less error potential for the admins !
 urls = {
@@ -33,15 +33,21 @@
         "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
         "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
     },
-    "test": {  # using VERSION to check if files are there
-        "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
-        "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
-        "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
-    },
 }
 
 
-def url_download(url, target_directory):
+def is_urlfile(url):
+    # Check if online file exists
+    try:
+        r = urlopen(url)  # response
+        return r.getcode() < 400
+    except HTTPError:
+        return False
+
+
+def url_download(url, target_directory, meta):
+
+    # download the url
     url_parts = urlparse(url)
     tarball = os.path.abspath(
         os.path.join(target_directory, os.path.basename(url_parts.path))
@@ -63,36 +69,56 @@
     finally:
         if src is not None:
             src.close()
-    if tarfile.is_tarfile(tarball):
-        fh = tarfile.open(tarball, "r:*")
-    else:
-        # unzip metadata file
-        if ".gz" in tarball:
+
+    # extract the metadata
+    if meta:
+        # extract the content of *.tar.gz into the target dir
+        if tarfile.is_tarfile(tarball):
+            fh = tarfile.open(tarball, "r:*")
+            fh.extractall(target_directory)
+            fh.close()
+            os.remove(tarball)
+            return target_directory  # return path to output folder
+        # extract the content of *.gz into the target dir
+        elif ".gz" in tarball:
             with gzip.open(tarball, "rb") as f_in:
                 unzipped_file = tarball.strip(".gz")
                 with open(unzipped_file, "wb") as f_out:
                     shutil.copyfileobj(f_in, f_out)
-                os.remove(tarball)
-                folder_of_unzipped_file = os.path.dirname(unzipped_file)
+                    os.remove(tarball)
+                    folder_of_unzipped_file = os.path.dirname(unzipped_file)
             return folder_of_unzipped_file
         else:
-            # this is basically only the return for the test not using a tarfile
+            sys.exit(
+                "No correct input format for metadata file, must be .tar.gz or .gz"
+            )
+    else:
+        # handle the DB
+        # extract the content of the folder in the tar.gz into the target dir
+        if tarfile.is_tarfile(tarball):
+            fh = tarfile.open(tarball, "r:*")
+            fh.extractall(target_directory)
+            fh.close()
+            os.remove(tarball)
+        else:
+            # handle the test case for the DB
             return tarball
-    fh.extractall(target_directory)
-    fh.close()
-    os.remove(tarball)
-    # The tarball extraction will create a directory named
-    # something like release202 in the target_directory, so
-    # we need to move the items in that directory to the
-    # target directory.
-    subdir = next(os.walk(target_directory))[1][0]
-    subdir_path = os.path.join(target_directory, subdir)
-    items = os.listdir(subdir_path)
-    for item in items:
-        item_path = os.path.join(subdir_path, item)
-        shutil.move(item_path, target_directory)
-    os.rmdir(subdir_path)
-    return target_directory
+
+        fh.extractall(target_directory)
+        fh.close()
+        os.remove(tarball)
+        # The tarball extraction will create a directory named
+        # something like release202 in the target_directory, so
+        # we need to move the items in that directory to the
+        # target directory.
+        subdir = next(os.walk(target_directory))[1][0]
+        subdir_path = os.path.join(target_directory, subdir)
+        items = os.listdir(subdir_path)
+        for item in items:
+            item_path = os.path.join(subdir_path, item)
+            shutil.move(item_path, target_directory)
+        os.rmdir(subdir_path)
+        return target_directory
 
 
 def download(database_name, release, meta, test, out_file):
@@ -104,18 +130,26 @@
     os.makedirs(target_directory)
 
     if test:
-        release = "test"
+        # switch the DB to use the test case
+        urls[release][
+            "full"
+        ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt"
+
+        # use the test run to check that all urls exist
+        for _version, items in urls.items():
+            for url in items.values():
+                assert is_urlfile(url)
 
     # download both taxonomy metadata tables
     if meta:
         url = urls[release]["meta_ar"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
         url = urls[release]["meta_bac"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
     # download the full DB
     else:
         url = urls[release]["full"]
-        file_path = url_download(url, target_directory)
+        file_path = url_download(url, target_directory, meta)
 
     time = datetime.utcnow().strftime("%Y-%m-%d")
 
--- a/data_manager/gtdbtk_database_installer.xml	Wed Aug 14 18:02:46 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml	Fri Aug 16 08:44:14 2024 +0000
@@ -36,26 +36,38 @@
         <test>
             <!-- TODO -->
             <!-- Not actually installing a huge GTDB-Tk database -->
+            <!-- but it will check if all urls exist -->
             <param name="release" value="202"/>
             <param name="database_name" value="GTDB-Tk database release 202"/>
             <param name="test" value="--test"/>
             <output name="out_file">
                 <assert_contents>
                     <has_text text="GTDB-Tk database release 202"/>
-                    <has_text text="release_test"/>
+                    <has_text text="release_202"/>
                 </assert_contents>
             </output>
         </test>
         <test>
-            <!-- Test meta data download -->
-            <param name="release" value="202"/>
-            <param name="database_name" value="GTDB-Tk database release 202 metadata"/>
+            <!-- Test metadata download with tsv.gz -->
+            <param name="release" value="220"/>
+            <param name="database_name" value="GTDB-Tk database release 220 metadata"/>
             <param name="meta" value="true"/>
-            <param name="test" value="--test"/>
             <output name="out_file">
                 <assert_contents>
-                    <has_text text="GTDB-Tk database release 202 metadata"/>
-                    <has_text text="release_test"/>
+                    <has_text text="GTDB-Tk database release 220 metadata"/>
+                    <has_text text="release_220"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test>
+            <!-- Test metadata download with tar.gz -->
+            <param name="release" value="207"/>
+            <param name="database_name" value="GTDB-Tk database release 207 metadata"/>
+            <param name="meta" value="true"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text="GTDB-Tk database release 207 metadata"/>
+                    <has_text text="release_207"/>
                 </assert_contents>
             </output>
         </test>