comparison data_manager/pangolearn_dm.py @ 4:6e24e79d3d69 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_pangolearn commit fd2566abd51c88847437d38a5abea8703b8ee034"
author iuc
date Tue, 05 Apr 2022 18:40:07 +0000
parents df30a2f1db55
children 29c738066906
comparison
equal deleted inserted replaced
3:df30a2f1db55 4:6e24e79d3d69
10 import tarfile 10 import tarfile
11 11
12 import requests 12 import requests
13 13
14 14
15 def extract_date(tag_str):
16 parts = tag_str.split("_")
17 assert len(parts) < 3, "expected maximum of two parts, got " + str(parts)
18 # there are tags like: 2021-07-07-2
19 parts[0] = "-".join(parts[0].split("-")[:3])
20 tag_date = datetime.datetime.strptime(parts[0], "%Y-%m-%d")
21 if len(parts) == 2:
22 version = int(parts[1])
23 assert (
24 version < 24 * 60
25 ) # because the code stores versions as minutes of the day, it can't handle versions > 1440
26 tag_date += datetime.timedelta(minutes=version)
27 return tag_date
28
29
30 def get_model_list( 15 def get_model_list(
31 existing_release_tags, 16 existing_release_tags,
32 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases", 17 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases"
33 ): 18 ):
34 response = requests.get(url) 19 page_num = 0
35 if response.status_code == 200: 20 while True:
36 release_list = json.loads(response.text) 21 page_num += 1
37 release_info = [ 22 response = requests.get(url + f'?page={page_num}')
38 dict( 23 if response.status_code == 200:
39 tag_name=e["tag_name"], 24 release_list_chunk = json.loads(response.text)
40 name=e["name"], 25 if not release_list_chunk:
41 date=extract_date(e["tag_name"]), 26 # past the last page of results
42 tarball_url=e["tarball_url"], 27 return
43 ) 28 for e in release_list_chunk:
44 for e in release_list 29 if e["tag_name"] in existing_release_tags:
45 if e["tag_name"] not in existing_release_tags 30 continue
46 ] 31 if e["prerelease"]:
47 return release_info 32 continue
48 else: 33 yield dict(
49 response.raise_for_status() 34 tag_name=e["tag_name"],
35 name=e["name"],
36 date=parse_date(e["tag_name"]),
37 tarball_url=e["tarball_url"],
38 )
39 else:
40 response.raise_for_status()
50 41
51 42
52 def filter_by_date(existing_release_tags, start_date=None, end_date=None): 43 def filter_by_date(existing_release_tags, start_date=None, end_date=None):
53 release_list = get_model_list(existing_release_tags) 44 ret = []
54 return [ 45 for release in get_model_list(existing_release_tags):
55 element 46 if start_date and release["date"] < start_date:
56 for element in release_list 47 break
57 if not ( 48 if not end_date or release["date"] <= end_date:
58 (end_date is not None and element["date"] > end_date) 49 ret.append(release)
59 or (start_date is not None and element["date"] < start_date) 50
60 ) 51 return ret
61 ]
62 52
63 53
64 def download_and_unpack(url, output_directory): 54 def download_and_unpack(url, output_directory):
65 response = requests.get(url) 55 response = requests.get(url)
66 if response.status_code == 200: 56 if response.status_code == 200:
82 else: 72 else:
83 response.raise_for_status() 73 response.raise_for_status()
84 74
85 75
86 def parse_date(d): 76 def parse_date(d):
87 return datetime.datetime.strptime(d, "%Y-%m-%d") 77 # Tries to parse the first 10 chars of d as a date, which currently
78 # succeeds for all pangolearn model releases.
79 return datetime.datetime.strptime(d[:10], "%Y-%m-%d")
88 80
89 81
90 if __name__ == "__main__": 82 if __name__ == "__main__":
91 83
92 parser = argparse.ArgumentParser() 84 parser = argparse.ArgumentParser()
99 parser.add_argument("datatable_name") 91 parser.add_argument("datatable_name")
100 parser.add_argument("galaxy_datamanager_filename") 92 parser.add_argument("galaxy_datamanager_filename")
101 args = parser.parse_args() 93 args = parser.parse_args()
102 94
103 if args.testmode: 95 if args.testmode:
104 releases = filter_by_date(start_date=args.start_date, end_date=args.end_date) 96 releases = filter_by_date([], start_date=args.start_date, end_date=args.end_date)
105 for release in releases: 97 for release in releases:
106 print(release["tag_name"], release["tarball_url"].split("/")[-1]) 98 print(release["tag_name"], release["tarball_url"].split("/")[-1], release["date"])
107 sys.exit(0) 99 sys.exit(0)
108 100
109 with open(args.galaxy_datamanager_filename) as fh: 101 with open(args.galaxy_datamanager_filename) as fh:
110 config = json.load(fh) 102 config = json.load(fh)
111 103
127 ] 119 ]
128 ) 120 )
129 else: 121 else:
130 existing_release_tags = set() 122 existing_release_tags = set()
131 if args.latest: 123 if args.latest:
132 releases = [get_model_list(existing_release_tags)[0]] 124 releases = [next(get_model_list(existing_release_tags))]
133 else: 125 else:
134 releases = filter_by_date( 126 releases = filter_by_date(
135 existing_release_tags, start_date=args.start_date, end_date=args.end_date 127 existing_release_tags, start_date=args.start_date, end_date=args.end_date
136 ) 128 )
137 releases_to_download = [ 129 releases_to_download = [
138 release 130 release
139 for release in releases 131 for release in releases
140 if release["tag_name"] not in existing_release_tags 132 if release["tag_name"] not in existing_release_tags
141 ] 133 ]
142 for release in releases_to_download: 134 for release in releases_to_download:
143 tag = download_and_unpack(release["tarball_url"], output_directory) 135 fname = download_and_unpack(release["tarball_url"], output_directory)
144 release_date = parse_date(tag)
145 if args.pangolearn_format_version is not None: 136 if args.pangolearn_format_version is not None:
146 version = args.pangolearn_format_version 137 version = args.pangolearn_format_version
147 else: 138 else:
148 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format 139 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format
149 if release_date >= datetime.datetime(2021, 5, 27): 140 if release["date"] >= datetime.datetime(2021, 5, 27):
150 version = '3.0' 141 version = '3.0'
151 else: 142 else:
152 version = '1.0' 143 version = '1.0'
153 data_manager_dict["data_tables"][args.datatable_name].append( 144 data_manager_dict["data_tables"][args.datatable_name].append(
154 dict( 145 dict(
155 value=tag, 146 value=release["tag_name"],
156 description=release["name"], 147 description=release["name"],
157 format_version=version, 148 format_version=version,
158 path=output_directory + "/" + tag, 149 path=output_directory + "/" + fname,
159 ) 150 )
160 ) 151 )
161 data_manager_dict["data_tables"][args.datatable_name].sort( 152 data_manager_dict["data_tables"][args.datatable_name].sort(
162 key=operator.itemgetter("value"), reverse=True 153 key=operator.itemgetter("value"), reverse=True
163 ) 154 )