Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
comparison data_manager/gtdbtk_database_installer.py @ 4:10232d2b5062 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
author | iuc |
---|---|
date | Fri, 16 Aug 2024 08:44:14 +0000 |
parents | c4830a9870fa |
children | df84aaed4769 |
comparison
equal
deleted
inserted
replaced
3:c4830a9870fa | 4:10232d2b5062 |
---|---|
7 import shutil | 7 import shutil |
8 import sys | 8 import sys |
9 import tarfile | 9 import tarfile |
10 from datetime import datetime | 10 from datetime import datetime |
11 from urllib.parse import urlparse | 11 from urllib.parse import urlparse |
12 from urllib.request import Request, urlopen | 12 from urllib.request import HTTPError, Request, urlopen |
13 | 13 |
14 # rather provide the urls based on the release, less error potential for the admins ! | 14 # rather provide the urls based on the release, less error potential for the admins ! |
15 urls = { | 15 urls = { |
16 "202": { | 16 "202": { |
17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", | 17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", |
31 "220": { | 31 "220": { |
32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", | 32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", |
33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", | 33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", |
34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", | 34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", |
35 }, | 35 }, |
36 "test": { # using VERSION to check if files are there | |
37 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt", | |
38 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", | |
39 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", | |
40 }, | |
41 } | 36 } |
42 | 37 |
43 | 38 |
44 def url_download(url, target_directory): | 39 def is_urlfile(url): |
40 # Check if online file exists | |
41 try: | |
42 r = urlopen(url) # response | |
43 return r.getcode() < 400 | |
44 except HTTPError: | |
45 return False | |
46 | |
47 | |
48 def url_download(url, target_directory, meta): | |
49 | |
50 # download the url | |
45 url_parts = urlparse(url) | 51 url_parts = urlparse(url) |
46 tarball = os.path.abspath( | 52 tarball = os.path.abspath( |
47 os.path.join(target_directory, os.path.basename(url_parts.path)) | 53 os.path.join(target_directory, os.path.basename(url_parts.path)) |
48 ) | 54 ) |
49 src = None | 55 src = None |
61 except Exception as e: | 67 except Exception as e: |
62 sys.exit(str(e)) | 68 sys.exit(str(e)) |
63 finally: | 69 finally: |
64 if src is not None: | 70 if src is not None: |
65 src.close() | 71 src.close() |
66 if tarfile.is_tarfile(tarball): | 72 |
67 fh = tarfile.open(tarball, "r:*") | 73 # extract the metadata |
68 else: | 74 if meta: |
69 # unzip metadata file | 75 # extract the content of *.tar.gz into the target dir |
70 if ".gz" in tarball: | 76 if tarfile.is_tarfile(tarball): |
77 fh = tarfile.open(tarball, "r:*") | |
78 fh.extractall(target_directory) | |
79 fh.close() | |
80 os.remove(tarball) | |
81 return target_directory # return path to output folder | |
82 # extract the content of *.gz into the target dir | |
83 elif ".gz" in tarball: | |
71 with gzip.open(tarball, "rb") as f_in: | 84 with gzip.open(tarball, "rb") as f_in: |
72 unzipped_file = tarball.strip(".gz") | 85 unzipped_file = tarball.strip(".gz") |
73 with open(unzipped_file, "wb") as f_out: | 86 with open(unzipped_file, "wb") as f_out: |
74 shutil.copyfileobj(f_in, f_out) | 87 shutil.copyfileobj(f_in, f_out) |
75 os.remove(tarball) | 88 os.remove(tarball) |
76 folder_of_unzipped_file = os.path.dirname(unzipped_file) | 89 folder_of_unzipped_file = os.path.dirname(unzipped_file) |
77 return folder_of_unzipped_file | 90 return folder_of_unzipped_file |
78 else: | 91 else: |
79 # this is basically only the return for the test not using a tarfile | 92 sys.exit( |
93 "No correct input format for metadata file, must be .tar.gz or .gz" | |
94 ) | |
95 else: | |
96 # handle the DB | |
97 # extract the content of the folder in the tar.gz into the target dir | |
98 if tarfile.is_tarfile(tarball): | |
99 fh = tarfile.open(tarball, "r:*") | |
100 fh.extractall(target_directory) | |
101 fh.close() | |
102 os.remove(tarball) | |
103 else: | |
104 # handle the test case for the DB | |
80 return tarball | 105 return tarball |
81 fh.extractall(target_directory) | 106 |
82 fh.close() | 107 fh.extractall(target_directory) |
83 os.remove(tarball) | 108 fh.close() |
84 # The tarball extraction will create a directory named | 109 os.remove(tarball) |
85 # something like release202 in the target_directory, so | 110 # The tarball extraction will create a directory named |
86 # we need to move the items in that directory to the | 111 # something like release202 in the target_directory, so |
87 # target directory. | 112 # we need to move the items in that directory to the |
88 subdir = next(os.walk(target_directory))[1][0] | 113 # target directory. |
89 subdir_path = os.path.join(target_directory, subdir) | 114 subdir = next(os.walk(target_directory))[1][0] |
90 items = os.listdir(subdir_path) | 115 subdir_path = os.path.join(target_directory, subdir) |
91 for item in items: | 116 items = os.listdir(subdir_path) |
92 item_path = os.path.join(subdir_path, item) | 117 for item in items: |
93 shutil.move(item_path, target_directory) | 118 item_path = os.path.join(subdir_path, item) |
94 os.rmdir(subdir_path) | 119 shutil.move(item_path, target_directory) |
95 return target_directory | 120 os.rmdir(subdir_path) |
121 return target_directory | |
96 | 122 |
97 | 123 |
98 def download(database_name, release, meta, test, out_file): | 124 def download(database_name, release, meta, test, out_file): |
99 | 125 |
100 with open(out_file) as fh: | 126 with open(out_file) as fh: |
102 | 128 |
103 target_directory = params["output_data"][0]["extra_files_path"] | 129 target_directory = params["output_data"][0]["extra_files_path"] |
104 os.makedirs(target_directory) | 130 os.makedirs(target_directory) |
105 | 131 |
106 if test: | 132 if test: |
107 release = "test" | 133 # switch the DB to use the test case |
134 urls[release][ | |
135 "full" | |
136 ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt" | |
137 | |
138 # make use of the test to check if all urls exists | |
139 for _version, items in urls.items(): | |
140 for url in items.values(): | |
141 assert is_urlfile(url) | |
108 | 142 |
109 # download both taxonomy metadata tables | 143 # download both taxonomy metadata tables |
110 if meta: | 144 if meta: |
111 url = urls[release]["meta_ar"] | 145 url = urls[release]["meta_ar"] |
112 file_path = url_download(url, target_directory) | 146 file_path = url_download(url, target_directory, meta) |
113 url = urls[release]["meta_bac"] | 147 url = urls[release]["meta_bac"] |
114 file_path = url_download(url, target_directory) | 148 file_path = url_download(url, target_directory, meta) |
115 # download the full DB | 149 # download the full DB |
116 else: | 150 else: |
117 url = urls[release]["full"] | 151 url = urls[release]["full"] |
118 file_path = url_download(url, target_directory) | 152 file_path = url_download(url, target_directory, meta) |
119 | 153 |
120 time = datetime.utcnow().strftime("%Y-%m-%d") | 154 time = datetime.utcnow().strftime("%Y-%m-%d") |
121 | 155 |
122 data_manager_json = {"data_tables": {}} | 156 data_manager_json = {"data_tables": {}} |
123 data_manager_entry = {} | 157 data_manager_entry = {} |