comparison data_manager/gtdbtk_database_installer.py @ 4:10232d2b5062 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit 0bb620025a61de6caa8a93537fec8ea65eda43be
author iuc
date Fri, 16 Aug 2024 08:44:14 +0000
parents c4830a9870fa
children df84aaed4769
comparison
equal deleted inserted replaced
3:c4830a9870fa 4:10232d2b5062
7 import shutil 7 import shutil
8 import sys 8 import sys
9 import tarfile 9 import tarfile
10 from datetime import datetime 10 from datetime import datetime
11 from urllib.parse import urlparse 11 from urllib.parse import urlparse
12 from urllib.request import Request, urlopen 12 from urllib.request import HTTPError, Request, urlopen
13 13
14 # rather provide the urls based on the release, less error potential for the admins ! 14 # rather provide the urls based on the release, less error potential for the admins !
15 urls = { 15 urls = {
16 "202": { 16 "202": {
17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", 17 "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz",
31 "220": { 31 "220": {
32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", 32 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz",
33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz", 33 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz", 34 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
35 }, 35 },
36 "test": { # using VERSION to check if files are there
37 "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt",
38 "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_metadata_r220.tsv.gz",
39 "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_metadata_r220.tsv.gz",
40 },
41 } 36 }
42 37
43 38
44 def url_download(url, target_directory): 39 def is_urlfile(url):
40 # Check if online file exists
41 try:
42 r = urlopen(url) # response
43 return r.getcode() < 400
44 except HTTPError:
45 return False
46
47
48 def url_download(url, target_directory, meta):
49
50 # download the url
45 url_parts = urlparse(url) 51 url_parts = urlparse(url)
46 tarball = os.path.abspath( 52 tarball = os.path.abspath(
47 os.path.join(target_directory, os.path.basename(url_parts.path)) 53 os.path.join(target_directory, os.path.basename(url_parts.path))
48 ) 54 )
49 src = None 55 src = None
61 except Exception as e: 67 except Exception as e:
62 sys.exit(str(e)) 68 sys.exit(str(e))
63 finally: 69 finally:
64 if src is not None: 70 if src is not None:
65 src.close() 71 src.close()
66 if tarfile.is_tarfile(tarball): 72
67 fh = tarfile.open(tarball, "r:*") 73 # extract the metadata
68 else: 74 if meta:
69 # unzip metadata file 75 # extract the content of *.tar.gz into the target dir
70 if ".gz" in tarball: 76 if tarfile.is_tarfile(tarball):
77 fh = tarfile.open(tarball, "r:*")
78 fh.extractall(target_directory)
79 fh.close()
80 os.remove(tarball)
81 return target_directory # return path to output folder
82 # extract the content of *.gz into the target dir
83 elif ".gz" in tarball:
71 with gzip.open(tarball, "rb") as f_in: 84 with gzip.open(tarball, "rb") as f_in:
72 unzipped_file = tarball.strip(".gz") 85 unzipped_file = tarball.strip(".gz")
73 with open(unzipped_file, "wb") as f_out: 86 with open(unzipped_file, "wb") as f_out:
74 shutil.copyfileobj(f_in, f_out) 87 shutil.copyfileobj(f_in, f_out)
75 os.remove(tarball) 88 os.remove(tarball)
76 folder_of_unzipped_file = os.path.dirname(unzipped_file) 89 folder_of_unzipped_file = os.path.dirname(unzipped_file)
77 return folder_of_unzipped_file 90 return folder_of_unzipped_file
78 else: 91 else:
79 # this is basically only the return for the test not using a tarfile 92 sys.exit(
93 "No correct input format for metadata file, must be .tar.gz or .gz"
94 )
95 else:
96 # handle the DB
97 # extract the content of the folder in the tar.gz into the target dir
98 if tarfile.is_tarfile(tarball):
99 fh = tarfile.open(tarball, "r:*")
100 fh.extractall(target_directory)
101 fh.close()
102 os.remove(tarball)
103 else:
104 # handle the test case for the DB
80 return tarball 105 return tarball
81 fh.extractall(target_directory) 106
82 fh.close() 107 fh.extractall(target_directory)
83 os.remove(tarball) 108 fh.close()
84 # The tarball extraction will create a directory named 109 os.remove(tarball)
85 # something like release202 in the target_directory, so 110 # The tarball extraction will create a directory named
86 # we need to move the items in that directory to the 111 # something like release202 in the target_directory, so
87 # target directory. 112 # we need to move the items in that directory to the
88 subdir = next(os.walk(target_directory))[1][0] 113 # target directory.
89 subdir_path = os.path.join(target_directory, subdir) 114 subdir = next(os.walk(target_directory))[1][0]
90 items = os.listdir(subdir_path) 115 subdir_path = os.path.join(target_directory, subdir)
91 for item in items: 116 items = os.listdir(subdir_path)
92 item_path = os.path.join(subdir_path, item) 117 for item in items:
93 shutil.move(item_path, target_directory) 118 item_path = os.path.join(subdir_path, item)
94 os.rmdir(subdir_path) 119 shutil.move(item_path, target_directory)
95 return target_directory 120 os.rmdir(subdir_path)
121 return target_directory
96 122
97 123
98 def download(database_name, release, meta, test, out_file): 124 def download(database_name, release, meta, test, out_file):
99 125
100 with open(out_file) as fh: 126 with open(out_file) as fh:
102 128
103 target_directory = params["output_data"][0]["extra_files_path"] 129 target_directory = params["output_data"][0]["extra_files_path"]
104 os.makedirs(target_directory) 130 os.makedirs(target_directory)
105 131
106 if test: 132 if test:
107 release = "test" 133 # switch the DB to use the test case
134 urls[release][
135 "full"
136 ] = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt"
137
138 # make use of the test to check if all urls exists
139 for _version, items in urls.items():
140 for url in items.values():
141 assert is_urlfile(url)
108 142
109 # download both taxonomy metadata tables 143 # download both taxonomy metadata tables
110 if meta: 144 if meta:
111 url = urls[release]["meta_ar"] 145 url = urls[release]["meta_ar"]
112 file_path = url_download(url, target_directory) 146 file_path = url_download(url, target_directory, meta)
113 url = urls[release]["meta_bac"] 147 url = urls[release]["meta_bac"]
114 file_path = url_download(url, target_directory) 148 file_path = url_download(url, target_directory, meta)
115 # download the full DB 149 # download the full DB
116 else: 150 else:
117 url = urls[release]["full"] 151 url = urls[release]["full"]
118 file_path = url_download(url, target_directory) 152 file_path = url_download(url, target_directory, meta)
119 153
120 time = datetime.utcnow().strftime("%Y-%m-%d") 154 time = datetime.utcnow().strftime("%Y-%m-%d")
121 155
122 data_manager_json = {"data_tables": {}} 156 data_manager_json = {"data_tables": {}}
123 data_manager_entry = {} 157 data_manager_entry = {}