Mercurial > repos > iuc > data_manager_pangolearn
comparison data_manager/pangolearn_dm.py @ 4:6e24e79d3d69 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_pangolearn commit fd2566abd51c88847437d38a5abea8703b8ee034"
author | iuc |
---|---|
date | Tue, 05 Apr 2022 18:40:07 +0000 |
parents | df30a2f1db55 |
children | 29c738066906 |
comparison legend: equal, deleted, inserted, replaced
3:df30a2f1db55 | 4:6e24e79d3d69 |
---|---|
10 import tarfile | 10 import tarfile |
11 | 11 |
12 import requests | 12 import requests |
13 | 13 |
14 | 14 |
15 def extract_date(tag_str): | |
16 parts = tag_str.split("_") | |
17 assert len(parts) < 3, "expected maximum of two parts, got " + str(parts) | |
18 # there are tags like: 2021-07-07-2 | |
19 parts[0] = "-".join(parts[0].split("-")[:3]) | |
20 tag_date = datetime.datetime.strptime(parts[0], "%Y-%m-%d") | |
21 if len(parts) == 2: | |
22 version = int(parts[1]) | |
23 assert ( | |
24 version < 24 * 60 | |
25 ) # because the code stores versions as minutes of the day, it can't handle versions > 1440 | |
26 tag_date += datetime.timedelta(minutes=version) | |
27 return tag_date | |
28 | |
29 | |
30 def get_model_list( | 15 def get_model_list( |
31 existing_release_tags, | 16 existing_release_tags, |
32 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases", | 17 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases" |
33 ): | 18 ): |
34 response = requests.get(url) | 19 page_num = 0 |
35 if response.status_code == 200: | 20 while True: |
36 release_list = json.loads(response.text) | 21 page_num += 1 |
37 release_info = [ | 22 response = requests.get(url + f'?page={page_num}') |
38 dict( | 23 if response.status_code == 200: |
39 tag_name=e["tag_name"], | 24 release_list_chunk = json.loads(response.text) |
40 name=e["name"], | 25 if not release_list_chunk: |
41 date=extract_date(e["tag_name"]), | 26 # past the last page of results |
42 tarball_url=e["tarball_url"], | 27 return |
43 ) | 28 for e in release_list_chunk: |
44 for e in release_list | 29 if e["tag_name"] in existing_release_tags: |
45 if e["tag_name"] not in existing_release_tags | 30 continue |
46 ] | 31 if e["prerelease"]: |
47 return release_info | 32 continue |
48 else: | 33 yield dict( |
49 response.raise_for_status() | 34 tag_name=e["tag_name"], |
35 name=e["name"], | |
36 date=parse_date(e["tag_name"]), | |
37 tarball_url=e["tarball_url"], | |
38 ) | |
39 else: | |
40 response.raise_for_status() | |
50 | 41 |
51 | 42 |
52 def filter_by_date(existing_release_tags, start_date=None, end_date=None): | 43 def filter_by_date(existing_release_tags, start_date=None, end_date=None): |
53 release_list = get_model_list(existing_release_tags) | 44 ret = [] |
54 return [ | 45 for release in get_model_list(existing_release_tags): |
55 element | 46 if start_date and release["date"] < start_date: |
56 for element in release_list | 47 break |
57 if not ( | 48 if not end_date or release["date"] <= end_date: |
58 (end_date is not None and element["date"] > end_date) | 49 ret.append(release) |
59 or (start_date is not None and element["date"] < start_date) | 50 |
60 ) | 51 return ret |
61 ] | |
62 | 52 |
63 | 53 |
64 def download_and_unpack(url, output_directory): | 54 def download_and_unpack(url, output_directory): |
65 response = requests.get(url) | 55 response = requests.get(url) |
66 if response.status_code == 200: | 56 if response.status_code == 200: |
82 else: | 72 else: |
83 response.raise_for_status() | 73 response.raise_for_status() |
84 | 74 |
85 | 75 |
86 def parse_date(d): | 76 def parse_date(d): |
87 return datetime.datetime.strptime(d, "%Y-%m-%d") | 77 # Tries to parse the first 10 chars of d as a date, which currently |
78 # succeeds for all pangolearn model releases. | |
79 return datetime.datetime.strptime(d[:10], "%Y-%m-%d") | |
88 | 80 |
89 | 81 |
90 if __name__ == "__main__": | 82 if __name__ == "__main__": |
91 | 83 |
92 parser = argparse.ArgumentParser() | 84 parser = argparse.ArgumentParser() |
99 parser.add_argument("datatable_name") | 91 parser.add_argument("datatable_name") |
100 parser.add_argument("galaxy_datamanager_filename") | 92 parser.add_argument("galaxy_datamanager_filename") |
101 args = parser.parse_args() | 93 args = parser.parse_args() |
102 | 94 |
103 if args.testmode: | 95 if args.testmode: |
104 releases = filter_by_date(start_date=args.start_date, end_date=args.end_date) | 96 releases = filter_by_date([], start_date=args.start_date, end_date=args.end_date) |
105 for release in releases: | 97 for release in releases: |
106 print(release["tag_name"], release["tarball_url"].split("/")[-1]) | 98 print(release["tag_name"], release["tarball_url"].split("/")[-1], release["date"]) |
107 sys.exit(0) | 99 sys.exit(0) |
108 | 100 |
109 with open(args.galaxy_datamanager_filename) as fh: | 101 with open(args.galaxy_datamanager_filename) as fh: |
110 config = json.load(fh) | 102 config = json.load(fh) |
111 | 103 |
127 ] | 119 ] |
128 ) | 120 ) |
129 else: | 121 else: |
130 existing_release_tags = set() | 122 existing_release_tags = set() |
131 if args.latest: | 123 if args.latest: |
132 releases = [get_model_list(existing_release_tags)[0]] | 124 releases = [next(get_model_list(existing_release_tags))] |
133 else: | 125 else: |
134 releases = filter_by_date( | 126 releases = filter_by_date( |
135 existing_release_tags, start_date=args.start_date, end_date=args.end_date | 127 existing_release_tags, start_date=args.start_date, end_date=args.end_date |
136 ) | 128 ) |
137 releases_to_download = [ | 129 releases_to_download = [ |
138 release | 130 release |
139 for release in releases | 131 for release in releases |
140 if release["tag_name"] not in existing_release_tags | 132 if release["tag_name"] not in existing_release_tags |
141 ] | 133 ] |
142 for release in releases_to_download: | 134 for release in releases_to_download: |
143 tag = download_and_unpack(release["tarball_url"], output_directory) | 135 fname = download_and_unpack(release["tarball_url"], output_directory) |
144 release_date = parse_date(tag) | |
145 if args.pangolearn_format_version is not None: | 136 if args.pangolearn_format_version is not None: |
146 version = args.pangolearn_format_version | 137 version = args.pangolearn_format_version |
147 else: | 138 else: |
148 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format | 139 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format |
149 if release_date >= datetime.datetime(2021, 5, 27): | 140 if release["date"] >= datetime.datetime(2021, 5, 27): |
150 version = '3.0' | 141 version = '3.0' |
151 else: | 142 else: |
152 version = '1.0' | 143 version = '1.0' |
153 data_manager_dict["data_tables"][args.datatable_name].append( | 144 data_manager_dict["data_tables"][args.datatable_name].append( |
154 dict( | 145 dict( |
155 value=tag, | 146 value=release["tag_name"], |
156 description=release["name"], | 147 description=release["name"], |
157 format_version=version, | 148 format_version=version, |
158 path=output_directory + "/" + tag, | 149 path=output_directory + "/" + fname, |
159 ) | 150 ) |
160 ) | 151 ) |
161 data_manager_dict["data_tables"][args.datatable_name].sort( | 152 data_manager_dict["data_tables"][args.datatable_name].sort( |
162 key=operator.itemgetter("value"), reverse=True | 153 key=operator.itemgetter("value"), reverse=True |
163 ) | 154 ) |