data_manager_gtdbtk_database_installer: data_manager/gtdbtk_database

comparison data_manager/gtdbtk_database_installer.py @ 8:750d902de22c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae

author	iuc
date	Mon, 21 Oct 2024 15:49:53 +0000
parents	3b1d503c6260
children

comparison

equal deleted inserted replaced

-:3b1d503c6260
+:750d902de22c
 return r.getcode() < 400
 except HTTPError:
 return False
def _safe_member_path(root, name):
    """Return the on-disk path for archive entry *name* below *root*.

    *root* must already be an absolute, symlink-resolved path.

    Raises:
        tarfile.TarError: If the entry would be placed outside *root*, either
            lexically ("..", absolute names) or through a symlinked parent.
    """
    destination = os.path.normpath(os.path.join(root, name))
    if destination == root:
        # Entries such as "." map onto the target directory itself.
        return destination
    # Resolve the parent so a symlink extracted earlier from the same archive
    # cannot redirect a later write to somewhere outside the target directory.
    parent = os.path.realpath(os.path.dirname(destination))
    if os.path.commonpath([root, parent]) != root:
        raise tarfile.TarError(
            "refusing to extract {!r} outside of {!r}".format(name, root)
        )
    return destination


def extract_tar_iteratively(tarball, target_directory):
    """
    Extract a tar archive member-by-member, streaming file contents to disk.

    The archive is opened with transparent compression detection ("r:*") and
    iterated one member at a time; regular files are copied in 1 MB chunks so
    that a large file is never held in memory as a whole.

    Args:
        tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2).
        target_directory (str): Directory below which the archive content is
            written.

    Raises:
        OSError: If a file, directory or link cannot be created or written.
        tarfile.TarError: If the archive cannot be opened or read, or if a
            member (or a hard-link source) would resolve outside
            target_directory.

    Notes:
        - Missing parent directories are created on demand, so archives that
          do not list their directories explicitly are extracted correctly.
        - Hard-link sources are resolved relative to target_directory (tar
          stores them relative to the archive root), not the working directory.
        - An existing link at a destination is replaced, so extracting twice
          into the same directory does not fail.
        - Only directories, regular files, symlinks and hard links are
          handled; other member types are skipped.
        - File modes, ownership and timestamps from the archive are not
          restored.
        - Nothing is returned; the target directory is populated in place.
    """
    root = os.path.realpath(target_directory)
    with tarfile.open(tarball, "r:*") as fh:
        for member in fh:
            member_path = _safe_member_path(root, member.name)

            if member.isdir():
                os.makedirs(member_path, exist_ok=True)
                continue

            if not (member.isfile() or member.issym() or member.islnk()):
                # Devices, FIFOs, etc. are ignored.
                continue

            # Directory entries may be absent or appear after their content.
            os.makedirs(os.path.dirname(member_path), exist_ok=True)

            if member.isfile():
                if os.path.islink(member_path):
                    # Never write *through* a pre-existing symlink.
                    os.unlink(member_path)
                with fh.extractfile(member) as source, open(
                    member_path, "wb"
                ) as target:
                    shutil.copyfileobj(
                        source, target, length=1024 * 1024
                    )  # 1 MB chunks
                continue

            # Symlink or hard link: replace whatever link/file is already there.
            if os.path.lexists(member_path):
                os.unlink(member_path)
            if member.issym():
                # Symlink targets are stored relative to the link's own
                # directory, so they are used verbatim.
                os.symlink(member.linkname, member_path)
            else:
                os.link(_safe_member_path(root, member.linkname), member_path)
 def url_download(url, target_directory, meta):
 # download the url
 url_parts = urlparse(url)
 tarball = os.path.abspath(
 try:
 req = Request(url)
 src = urlopen(req)
 with open(tarball, "wb") as dst:
 while True:
-chunk = src.read(2**10)
+chunk = src.read(2**16)  # Read in 64 KB chunks instead of 1 KB
 if chunk:
 dst.write(chunk)
 else:
 break
 except Exception as e:
 # extract the metadata
 if meta:
 # extract the content of *.tar.gz into the target dir
 if tarfile.is_tarfile(tarball):
-fh = tarfile.open(tarball, "r:*")
+extract_tar_iteratively(tarball, target_directory)
-fh.extractall(target_directory)
-fh.close()
 os.remove(tarball)
 return target_directory  # return path to output folder
 # extract the content of *.gz into the target dir
 elif ".gz" in tarball:
 with gzip.open(tarball, "rb") as f_in:
 )
 else:
 # handle the DB
 # extract the content of the folder in the tar.gz into the target dir
 if tarfile.is_tarfile(tarball):
-fh = tarfile.open(tarball, "r:*")
+extract_tar_iteratively(tarball, target_directory)
-fh.extractall(target_directory)
-fh.close()
 os.remove(tarball)
 else:
 # handle the test case for the DB
 return tarball

Mercurial > repos > iuc > data_manager_gtdbtk_database_installer

comparison data_manager/gtdbtk_database_installer.py @ 8:750d902de22c draft default tip