Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
changeset 8:750d902de22c draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
author | iuc |
---|---|
date | Mon, 21 Oct 2024 15:49:53 +0000 |
parents | 3b1d503c6260 |
children | |
files | data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml |
diffstat | 2 files changed, 58 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.py Mon Oct 21 15:49:53 2024 +0000
@@ -45,6 +45,60 @@
         return False
 
 
+def extract_tar_iteratively(tarball, target_directory):
+    """
+    Extracts a .tar, .tar.gz, or .tar.bz2 archive iteratively in a memory-efficient manner.
+
+    This function processes the contents of the archive member-by-member, ensuring only
+    one file or directory is loaded into memory at any given time. It handles the creation
+    of directories and symbolic links, and streams large files to disk in chunks to avoid
+    memory overload.
+
+    Args:
+        tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2) to be extracted.
+        target_directory (str): The destination directory where the archive content
+            will be extracted.
+
+    Raises:
+        OSError: If there is an issue with file or directory creation, or writing to disk.
+        tarfile.TarError: If there is an issue opening or reading the tar archive.
+
+    Example Usage:
+        extract_tar_iteratively("archive.tar.gz", "/path/to/extract")
+
+    Notes:
+        - The function supports symbolic and hard links present in the tar archive.
+        - It ensures that directories are created before files are extracted.
+        - Large files are streamed to disk in 1 MB chunks to minimize memory usage.
+        - This function does not return anything but will populate the target directory with
+          the extracted content.
+    """
+
+    with tarfile.open(tarball, "r:*") as fh:
+        for member in fh:
+            # Full path to where the member should be extracted
+            member_path = os.path.join(target_directory, member.name)
+
+            if member.isdir():
+                # If it's a directory, ensure it exists
+                os.makedirs(member_path, exist_ok=True)
+            elif member.isfile():
+                # If it's a file, extract it in chunks to avoid memory spikes
+                with fh.extractfile(member) as source, open(
+                    member_path, "wb"
+                ) as target:
+                    shutil.copyfileobj(
+                        source, target, length=1024 * 1024
+                    )  # 1 MB chunks
+            elif member.issym() or member.islnk():
+                # Handle symlinks or hard links if necessary
+                target_link = os.path.join(target_directory, member.name)
+                if member.issym():
+                    os.symlink(member.linkname, target_link)
+                elif member.islnk():
+                    os.link(member.linkname, target_link)
+
+
 def url_download(url, target_directory, meta):
 
     # download the url
@@ -59,7 +113,7 @@
         src = urlopen(req)
         with open(tarball, "wb") as dst:
             while True:
-                chunk = src.read(2**10)
+                chunk = src.read(2**16)  # Read in 64 KB chunks instead of 1 KB
                 if chunk:
                     dst.write(chunk)
                 else:
@@ -74,9 +128,7 @@
     if meta:
         # extract the content of *.tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
             return target_directory  # return path to output folder
         # extract the content of *.gz into the target dir
@@ -96,9 +148,7 @@
         # handle the DB
         # extract the content of the folder in the tar.gz into the target dir
         if tarfile.is_tarfile(tarball):
-            fh = tarfile.open(tarball, "r:*")
-            fh.extractall(target_directory)
-            fh.close()
+            extract_tar_iteratively(tarball, target_directory)
             os.remove(tarball)
         else:
             # handle the test case for the DB
--- a/data_manager/gtdbtk_database_installer.xml Thu Oct 03 12:42:22 2024 +0000
+++ b/data_manager/gtdbtk_database_installer.xml Mon Oct 21 15:49:53 2024 +0000
@@ -2,7 +2,7 @@
     <description></description>
     <macros>
         <token name="@TOOL_VERSION@">202</token>
-        <token name="@VERSION_SUFFIX@">3</token>
+        <token name="@VERSION_SUFFIX@">4</token>
         <token name="@PROFILE@">20.09</token>
     </macros>
     <requirements>