comparison data_manager/gtdbtk_database_installer.py @ 8:750d902de22c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
author iuc
date Mon, 21 Oct 2024 15:49:53 +0000
parents 3b1d503c6260
children
comparison
equal deleted inserted replaced
7:3b1d503c6260 8:750d902de22c
43 return r.getcode() < 400 43 return r.getcode() < 400
44 except HTTPError: 44 except HTTPError:
45 return False 45 return False
46 46
47 47
def extract_tar_iteratively(tarball, target_directory):
    """
    Extract a tar archive member-by-member, streaming file contents to disk.

    The archive is opened with transparent compression detection ("r:*") and
    iterated one member at a time. Regular files are copied in 1 MB chunks so
    that a large member is never held in memory as a whole.

    Args:
        tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2).
        target_directory (str): Directory below which the members are created.

    Raises:
        OSError: If a directory, file or link cannot be created or written.
        tarfile.TarError: If the archive cannot be opened or read, or if a
            member (or the source of a hard link) would resolve to a location
            outside ``target_directory``.

    Example Usage:
        extract_tar_iteratively("archive.tar.gz", "/path/to/extract")

    Notes:
        - Only directories, regular files, symbolic links and hard links are
          handled; any other member type (devices, FIFOs, ...) is skipped.
        - Missing parent directories are created on demand, so archives that
          carry no explicit directory entries extract correctly.
        - Hard-link sources are resolved relative to ``target_directory``
          (the archive root); symbolic-link targets are stored verbatim.
        - File metadata (mode, mtime, ownership) is not restored.
        - The function returns nothing.
    """
    # Resolve the root once; every member destination is validated against it.
    root = os.path.realpath(target_directory)

    with tarfile.open(tarball, "r:*") as archive:
        for member in archive:
            member_path = os.path.join(target_directory, member.name)
            # Refuse "../" or absolute names, and writes that would pass
            # through a previously extracted symlink leaving the target dir.
            _ensure_inside(root, member_path, member.name)

            if member.isdir():
                os.makedirs(member_path, exist_ok=True)
                continue

            if not (member.isfile() or member.issym() or member.islnk()):
                # Special files are ignored.
                continue

            # The archive is not guaranteed to list a directory before the
            # entries it contains, so make sure the parent exists.
            parent = os.path.dirname(member_path)
            if parent:
                os.makedirs(parent, exist_ok=True)

            if member.isfile():
                # Stream the payload in 1 MB chunks to avoid memory spikes.
                with archive.extractfile(member) as source, open(
                    member_path, "wb"
                ) as target:
                    shutil.copyfileobj(source, target, length=1024 * 1024)
            elif member.issym():
                # Symlink targets are relative to the link's own directory,
                # so they are written exactly as stored in the archive.
                os.symlink(member.linkname, member_path)
            else:
                # Hard-link names are relative to the archive root, i.e. they
                # must be resolved below target_directory, not the CWD.
                link_source = os.path.join(target_directory, member.linkname)
                _ensure_inside(root, link_source, member.linkname)
                os.link(link_source, member_path)


def _ensure_inside(root, path, display_name):
    """Raise tarfile.TarError unless *path* resolves to *root* or below it."""
    resolved = os.path.realpath(path)
    # os.path.join(root, "") yields root with exactly one trailing separator,
    # which prevents "/data/out2" from matching the root "/data/out".
    if resolved != root and not resolved.startswith(os.path.join(root, "")):
        raise tarfile.TarError(
            "Refusing to extract '{}': path escapes the target directory".format(
                display_name
            )
        )
100
101
48 def url_download(url, target_directory, meta): 102 def url_download(url, target_directory, meta):
49 103
50 # download the url 104 # download the url
51 url_parts = urlparse(url) 105 url_parts = urlparse(url)
52 tarball = os.path.abspath( 106 tarball = os.path.abspath(
57 try: 111 try:
58 req = Request(url) 112 req = Request(url)
59 src = urlopen(req) 113 src = urlopen(req)
60 with open(tarball, "wb") as dst: 114 with open(tarball, "wb") as dst:
61 while True: 115 while True:
62 chunk = src.read(2**10) 116 chunk = src.read(2**16) # Read in 64 KB chunks instead of 1 KB
63 if chunk: 117 if chunk:
64 dst.write(chunk) 118 dst.write(chunk)
65 else: 119 else:
66 break 120 break
67 except Exception as e: 121 except Exception as e:
72 126
73 # extract the metadata 127 # extract the metadata
74 if meta: 128 if meta:
75 # extract the content of *.tar.gz into the target dir 129 # extract the content of *.tar.gz into the target dir
76 if tarfile.is_tarfile(tarball): 130 if tarfile.is_tarfile(tarball):
77 fh = tarfile.open(tarball, "r:*") 131 extract_tar_iteratively(tarball, target_directory)
78 fh.extractall(target_directory)
79 fh.close()
80 os.remove(tarball) 132 os.remove(tarball)
81 return target_directory # return path to output folder 133 return target_directory # return path to output folder
82 # extract the content of *.gz into the target dir 134 # extract the content of *.gz into the target dir
83 elif ".gz" in tarball: 135 elif ".gz" in tarball:
84 with gzip.open(tarball, "rb") as f_in: 136 with gzip.open(tarball, "rb") as f_in:
94 ) 146 )
95 else: 147 else:
96 # handle the DB 148 # handle the DB
97 # extract the content of the folder in the tar.gz into the target dir 149 # extract the content of the folder in the tar.gz into the target dir
98 if tarfile.is_tarfile(tarball): 150 if tarfile.is_tarfile(tarball):
99 fh = tarfile.open(tarball, "r:*") 151 extract_tar_iteratively(tarball, target_directory)
100 fh.extractall(target_directory)
101 fh.close()
102 os.remove(tarball) 152 os.remove(tarball)
103 else: 153 else:
104 # handle the test case for the DB 154 # handle the test case for the DB
105 return tarball 155 return tarball
106 156