Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
comparison data_manager/gtdbtk_database_installer.py @ 8:750d902de22c draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 1019bf0fda897582e2bbdc773aebb3e08e285aae
author | iuc |
---|---|
date | Mon, 21 Oct 2024 15:49:53 +0000 |
parents | 3b1d503c6260 |
children |
comparison
equal
deleted
inserted
replaced
7:3b1d503c6260 | 8:750d902de22c |
---|---|
43 return r.getcode() < 400 | 43 return r.getcode() < 400 |
44 except HTTPError: | 44 except HTTPError: |
45 return False | 45 return False |
46 | 46 |
47 | 47 |
| | 48 def extract_tar_iteratively(tarball, target_directory): |
| | 49 """ |
| | 50 Extracts a .tar, .tar.gz, or .tar.bz2 archive iteratively in a memory-efficient manner. |
| | 51 |
| | 52 This function processes the contents of the archive member-by-member, ensuring only |
| | 53 one file or directory is loaded into memory at any given time. It handles the creation |
| | 54 of directories and symbolic links, and streams large files to disk in chunks to avoid |
| | 55 memory overload. |
| | 56 |
| | 57 Args: |
| | 58 tarball (str): Path to the tar archive (e.g., .tar, .tar.gz, .tar.bz2) to be extracted. |
| | 59 target_directory (str): The destination directory where the archive content |
| | 60 will be extracted. |
| | 61 |
| | 62 Raises: |
| | 63 OSError: If there is an issue with file or directory creation, or writing to disk. |
| | 64 tarfile.TarError: If there is an issue opening or reading the tar archive. |
| | 65 |
| | 66 Example Usage: |
| | 67 extract_tar_iteratively("archive.tar.gz", "/path/to/extract") |
| | 68 |
| | 69 Notes: |
| | 70 - The function supports symbolic and hard links present in the tar archive. |
| | 71 - It ensures that directories are created before files are extracted. |
| | 72 - Large files are streamed to disk in 1 MB chunks to minimize memory usage. |
| | 73 - This function does not return anything but will populate the target directory with |
| | 74 the extracted content. |
| | 75 """ |
| | 76 |
| | 77 with tarfile.open(tarball, "r:*") as fh: |
| | 78 for member in fh: |
| | 79 # Full path to where the member should be extracted |
| | 80 member_path = os.path.join(target_directory, member.name) |
| | 81 |
| | 82 if member.isdir(): |
| | 83 # If it's a directory, ensure it exists |
| | 84 os.makedirs(member_path, exist_ok=True) |
| | 85 elif member.isfile(): |
| | 86 # If it's a file, extract it in chunks to avoid memory spikes |
| | 87 with fh.extractfile(member) as source, open( |
| | 88 member_path, "wb" |
| | 89 ) as target: |
| | 90 shutil.copyfileobj( |
| | 91 source, target, length=1024 * 1024 |
| | 92 ) # 1 MB chunks |
| | 93 elif member.issym() or member.islnk(): |
| | 94 # Handle symlinks or hard links if necessary |
| | 95 target_link = os.path.join(target_directory, member.name) |
| | 96 if member.issym(): |
| | 97 os.symlink(member.linkname, target_link) |
| | 98 elif member.islnk(): |
| | 99 os.link(member.linkname, target_link) |
| | 100 |
| | 101 |
48 def url_download(url, target_directory, meta): | 102 def url_download(url, target_directory, meta): |
49 | 103 |
50 # download the url | 104 # download the url |
51 url_parts = urlparse(url) | 105 url_parts = urlparse(url) |
52 tarball = os.path.abspath( | 106 tarball = os.path.abspath( |
57 try: | 111 try: |
58 req = Request(url) | 112 req = Request(url) |
59 src = urlopen(req) | 113 src = urlopen(req) |
60 with open(tarball, "wb") as dst: | 114 with open(tarball, "wb") as dst: |
61 while True: | 115 while True: |
62 chunk = src.read(2**10) | 116 chunk = src.read(2**16) # Read in 64 KB chunks instead of 1 KB |
63 if chunk: | 117 if chunk: |
64 dst.write(chunk) | 118 dst.write(chunk) |
65 else: | 119 else: |
66 break | 120 break |
67 except Exception as e: | 121 except Exception as e: |
72 | 126 |
73 # extract the metadata | 127 # extract the metadata |
74 if meta: | 128 if meta: |
75 # extract the content of *.tar.gz into the target dir | 129 # extract the content of *.tar.gz into the target dir |
76 if tarfile.is_tarfile(tarball): | 130 if tarfile.is_tarfile(tarball): |
77 fh = tarfile.open(tarball, "r:*") | 131 extract_tar_iteratively(tarball, target_directory) |
78 fh.extractall(target_directory) | |
79 fh.close() | |
80 os.remove(tarball) | 132 os.remove(tarball) |
81 return target_directory # return path to output folder | 133 return target_directory # return path to output folder |
82 # extract the content of *.gz into the target dir | 134 # extract the content of *.gz into the target dir |
83 elif ".gz" in tarball: | 135 elif ".gz" in tarball: |
84 with gzip.open(tarball, "rb") as f_in: | 136 with gzip.open(tarball, "rb") as f_in: |
94 ) | 146 ) |
95 else: | 147 else: |
96 # handle the DB | 148 # handle the DB |
97 # extract the content of the folder in the tar.gz into the target dir | 149 # extract the content of the folder in the tar.gz into the target dir |
98 if tarfile.is_tarfile(tarball): | 150 if tarfile.is_tarfile(tarball): |
99 fh = tarfile.open(tarball, "r:*") | 151 extract_tar_iteratively(tarball, target_directory) |
100 fh.extractall(target_directory) | |
101 fh.close() | |
102 os.remove(tarball) | 152 os.remove(tarball) |
103 else: | 153 else: |
104 # handle the test case for the DB | 154 # handle the test case for the DB |
105 return tarball | 155 return tarball |
106 | 156 |