changeset 8:750d902de22c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
author iuc
date Mon, 21 Oct 2024 15:49:53 +0000
parents 3b1d503c6260
children
files data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml
diffstat 2 files changed, 58 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py	Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py	Mon Oct 21 15:49:53 2024 +0000
@@ -45,6 +45,60 @@
         return False
 
 
+def extract_tar_iteratively(tarball, target_directory):
+    """
+    Extract a tar archive (plain or compressed) member by member.
+
+    Regular files are streamed to disk in 1 MB chunks, so memory use does
+    not grow with member size. Missing parent directories are created on
+    demand. Modes, ownership and timestamps stored in the archive are not
+    restored, and members that are neither directories, regular files nor
+    links are skipped.
+
+    Args:
+        tarball (str): Path to the archive; opened with mode "r:*".
+        target_directory (str): Directory that receives the content.
+
+    Raises:
+        OSError: If a directory, file or link cannot be created or written.
+        tarfile.TarError: If the archive cannot be read, or if a member
+            (or a hard-link source) resolves outside target_directory.
+    """
+    root = os.path.realpath(target_directory)
+
+    def checked(path):
+        # Reject "..", absolute names and symlinked parents.
+        if os.path.commonpath([root, os.path.realpath(path)]) != root:
+            raise tarfile.ExtractError("{!r} is outside {!r}".format(path, root))
+        return path
+
+    with tarfile.open(tarball, "r:*") as fh:
+        for member in fh:
+            member_path = os.path.join(target_directory, member.name)
+            parent = checked(os.path.dirname(member_path) or ".")
+            if member.isdir():
+                os.makedirs(checked(member_path), exist_ok=True)
+                continue
+            if not (member.isfile() or member.issym() or member.islnk()):
+                continue
+            # Archives are not required to list directories explicitly.
+            os.makedirs(parent, exist_ok=True)
+            if member.isfile():
+                with fh.extractfile(member) as source, open(
+                    checked(member_path), "wb"
+                ) as target:
+                    shutil.copyfileobj(source, target, length=1024 * 1024)
+                continue
+            if os.path.lexists(member_path):
+                os.remove(member_path)  # links are replaced, like files
+            if member.issym():
+                os.symlink(member.linkname, member_path)
+            else:
+                # Hard-link sources are archive-relative, not cwd-relative.
+                link_source = os.path.join(target_directory, member.linkname)
+                os.link(checked(link_source), member_path)
+
+
 def url_download(url, target_directory, meta):
 
     # download the url
@@ -59,7 +113,7 @@
         src = urlopen(req)
         with open(tarball, "wb") as dst:
             while True:
-                chunk = src.read(2**10)
+                chunk = src.read(2**16)  # Read in 64 KB chunks instead of 1 KB
                 if chunk:
                     dst.write(chunk)
                 else:
@@ -74,9 +128,7 @@
     if meta:
         # extract the content of *.tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
             return target_directory  # return path to output folder
         # extract the content of *.gz into the target dir
@@ -96,9 +148,7 @@
         # handle the DB
         # extract the content of the folder in the tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
         else:
             # handle the test case for the DB
--- a/data_manager/gtdbtk_database_installer.xml	Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml	Mon Oct 21 15:49:53 2024 +0000
@@ -2,7 +2,7 @@
     <description></description>
     <macros>
         <token name="@TOOL_VERSION@">202</token>
-        <token name="@VERSION_SUFFIX@">3</token>
+        <token name="@VERSION_SUFFIX@">4</token>
         <token name="@PROFILE@">20.09</token>
     </macros>
     <requirements>