# HG changeset patch
# User iuc
# Date 1729525793 0
# Node ID 750d902de22ccf49713ab403802e920b947631fa
# Parent 3b1d503c6260a7fdb805f263b568eac63dc13ef1
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
diff -r 3b1d503c6260 -r 750d902de22c data_manager/gtdbtk_database_installer.py
--- a/data_manager/gtdbtk_database_installer.py Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py Mon Oct 21 15:49:53 2024 +0000
@@ -45,6 +45,60 @@
         return False
+def extract_tar_iteratively(tarball, target_directory):
+    """
+    Extracts a .tar, .tar.gz, or .tar.bz2 archive iteratively in a memory-efficient manner.
+
+    This function processes the contents of the archive member-by-member, ensuring only
+    one file or directory is loaded into memory at any given time. It handles the creation
+    of directories and symbolic links, and streams large files to disk in chunks to avoid
+    memory overload.
+
+    Args:
+        tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2) to be extracted.
+        target_directory (str): The destination directory where the archive content
+            will be extracted.
+
+    Raises:
+        OSError: If there is an issue with file or directory creation, or writing to disk.
+        tarfile.TarError: If there is an issue opening or reading the tar archive.
+
+    Example Usage:
+        extract_tar_iteratively("archive.tar.gz", "/path/to/extract")
+
+    Notes:
+        - The function supports symbolic and hard links present in the tar archive.
+        - It ensures that directories are created before files are extracted.
+        - Large files are streamed to disk in 1 MB chunks to minimize memory usage.
+        - This function does not return anything but will populate the target directory with
+          the extracted content.
+    """
+
+    with tarfile.open(tarball, "r:*") as fh:
+        for member in fh:
+            # Full path to where the member should be extracted
+            member_path = os.path.join(target_directory, member.name)
+
+            if member.isdir():
+                # If it's a directory, ensure it exists
+                os.makedirs(member_path, exist_ok=True)
+            elif member.isfile():
+                # If it's a file, extract it in chunks to avoid memory spikes
+                with fh.extractfile(member) as source, open(
+                    member_path, "wb"
+                ) as target:
+                    shutil.copyfileobj(
+                        source, target, length=1024 * 1024
+                    )  # 1 MB chunks
+            elif member.issym() or member.islnk():
+                # Handle symlinks or hard links if necessary
+                target_link = os.path.join(target_directory, member.name)
+                if member.issym():
+                    os.symlink(member.linkname, target_link)
+                elif member.islnk():
+                    os.link(member.linkname, target_link)
+
+
 def url_download(url, target_directory, meta):
     # download the url
@@ -59,7 +113,7 @@
         src = urlopen(req)
         with open(tarball, "wb") as dst:
             while True:
-                chunk = src.read(2**10)
+                chunk = src.read(2**16)  # Read in 64 KB chunks instead of 1 KB
                 if chunk:
                     dst.write(chunk)
                 else:
@@ -74,9 +128,7 @@
     if meta:
         # extract the content of *.tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
             return target_directory  # return path to output folder
         # extract the content of *.gz into the target dir
@@ -96,9 +148,7 @@
         # handle the DB
         # extract the content of the folder in the tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
         else:
             # handle the test case for the DB
diff -r 3b1d503c6260 -r 750d902de22c data_manager/gtdbtk_database_installer.xml
--- a/data_manager/gtdbtk_database_installer.xml Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml Mon Oct 21 15:49:53 2024 +0000
@@ -2,7 +2,7 @@
202
- 3
+ 4
20.09
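
For reference, a minimal sketch of how the extract_tar_iteratively helper introduced above could be exercised on its own. The import path, archive filename, and destination directory are illustrative assumptions and are not part of this changeset:

    import os
    import tarfile

    # Assumes the patched script is importable as a module; adjust the import to your layout.
    from gtdbtk_database_installer import extract_tar_iteratively

    archive = "gtdbtk_data.tar.gz"  # placeholder archive path
    destination = "/tmp/gtdbtk_db"  # placeholder extraction directory

    os.makedirs(destination, exist_ok=True)
    if tarfile.is_tarfile(archive):
        # Walks the archive member-by-member and streams file contents in 1 MB chunks
        extract_tar_iteratively(archive, destination)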