Mercurial > repos > iuc > data_manager_pangolearn
comparison data_manager/pangolearn_dm.py @ 4:6e24e79d3d69 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_pangolearn commit fd2566abd51c88847437d38a5abea8703b8ee034"
| author | iuc |
|---|---|
| date | Tue, 05 Apr 2022 18:40:07 +0000 |
| parents | df30a2f1db55 |
| children | 29c738066906 |
comparison (legend: equal | deleted | inserted | replaced)
| 3:df30a2f1db55 | 4:6e24e79d3d69 |
|---|---|
| 10 import tarfile | 10 import tarfile |
| 11 | 11 |
| 12 import requests | 12 import requests |
| 13 | 13 |
| 14 | 14 |
| 15 def extract_date(tag_str): | |
| 16 parts = tag_str.split("_") | |
| 17 assert len(parts) < 3, "expected maximum of two parts, got " + str(parts) | |
| 18 # there are tags like: 2021-07-07-2 | |
| 19 parts[0] = "-".join(parts[0].split("-")[:3]) | |
| 20 tag_date = datetime.datetime.strptime(parts[0], "%Y-%m-%d") | |
| 21 if len(parts) == 2: | |
| 22 version = int(parts[1]) | |
| 23 assert ( | |
| 24 version < 24 * 60 | |
| 25 ) # because the code stores versions as minutes of the day, it can't handle versions > 1440 | |
| 26 tag_date += datetime.timedelta(minutes=version) | |
| 27 return tag_date | |
| 28 | |
| 29 | |
| 30 def get_model_list( | 15 def get_model_list( |
| 31 existing_release_tags, | 16 existing_release_tags, |
| 32 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases", | 17 url="https://api.github.com/repos/cov-lineages/pangoLEARN/releases" |
| 33 ): | 18 ): |
| 34 response = requests.get(url) | 19 page_num = 0 |
| 35 if response.status_code == 200: | 20 while True: |
| 36 release_list = json.loads(response.text) | 21 page_num += 1 |
| 37 release_info = [ | 22 response = requests.get(url + f'?page={page_num}') |
| 38 dict( | 23 if response.status_code == 200: |
| 39 tag_name=e["tag_name"], | 24 release_list_chunk = json.loads(response.text) |
| 40 name=e["name"], | 25 if not release_list_chunk: |
| 41 date=extract_date(e["tag_name"]), | 26 # past the last page of results |
| 42 tarball_url=e["tarball_url"], | 27 return |
| 43 ) | 28 for e in release_list_chunk: |
| 44 for e in release_list | 29 if e["tag_name"] in existing_release_tags: |
| 45 if e["tag_name"] not in existing_release_tags | 30 continue |
| 46 ] | 31 if e["prerelease"]: |
| 47 return release_info | 32 continue |
| 48 else: | 33 yield dict( |
| 49 response.raise_for_status() | 34 tag_name=e["tag_name"], |
| 35 name=e["name"], | |
| 36 date=parse_date(e["tag_name"]), | |
| 37 tarball_url=e["tarball_url"], | |
| 38 ) | |
| 39 else: | |
| 40 response.raise_for_status() | |
| 50 | 41 |
| 51 | 42 |
| 52 def filter_by_date(existing_release_tags, start_date=None, end_date=None): | 43 def filter_by_date(existing_release_tags, start_date=None, end_date=None): |
| 53 release_list = get_model_list(existing_release_tags) | 44 ret = [] |
| 54 return [ | 45 for release in get_model_list(existing_release_tags): |
| 55 element | 46 if start_date and release["date"] < start_date: |
| 56 for element in release_list | 47 break |
| 57 if not ( | 48 if not end_date or release["date"] <= end_date: |
| 58 (end_date is not None and element["date"] > end_date) | 49 ret.append(release) |
| 59 or (start_date is not None and element["date"] < start_date) | 50 |
| 60 ) | 51 return ret |
| 61 ] | |
| 62 | 52 |
| 63 | 53 |
| 64 def download_and_unpack(url, output_directory): | 54 def download_and_unpack(url, output_directory): |
| 65 response = requests.get(url) | 55 response = requests.get(url) |
| 66 if response.status_code == 200: | 56 if response.status_code == 200: |
| 82 else: | 72 else: |
| 83 response.raise_for_status() | 73 response.raise_for_status() |
| 84 | 74 |
| 85 | 75 |
| 86 def parse_date(d): | 76 def parse_date(d): |
| 87 return datetime.datetime.strptime(d, "%Y-%m-%d") | 77 # Tries to parse the first 10 chars of d as a date, which currently |
| 78 # succeeds for all pangolearn model releases. | |
| 79 return datetime.datetime.strptime(d[:10], "%Y-%m-%d") | |
| 88 | 80 |
| 89 | 81 |
| 90 if __name__ == "__main__": | 82 if __name__ == "__main__": |
| 91 | 83 |
| 92 parser = argparse.ArgumentParser() | 84 parser = argparse.ArgumentParser() |
| 99 parser.add_argument("datatable_name") | 91 parser.add_argument("datatable_name") |
| 100 parser.add_argument("galaxy_datamanager_filename") | 92 parser.add_argument("galaxy_datamanager_filename") |
| 101 args = parser.parse_args() | 93 args = parser.parse_args() |
| 102 | 94 |
| 103 if args.testmode: | 95 if args.testmode: |
| 104 releases = filter_by_date(start_date=args.start_date, end_date=args.end_date) | 96 releases = filter_by_date([], start_date=args.start_date, end_date=args.end_date) |
| 105 for release in releases: | 97 for release in releases: |
| 106 print(release["tag_name"], release["tarball_url"].split("/")[-1]) | 98 print(release["tag_name"], release["tarball_url"].split("/")[-1], release["date"]) |
| 107 sys.exit(0) | 99 sys.exit(0) |
| 108 | 100 |
| 109 with open(args.galaxy_datamanager_filename) as fh: | 101 with open(args.galaxy_datamanager_filename) as fh: |
| 110 config = json.load(fh) | 102 config = json.load(fh) |
| 111 | 103 |
| 127 ] | 119 ] |
| 128 ) | 120 ) |
| 129 else: | 121 else: |
| 130 existing_release_tags = set() | 122 existing_release_tags = set() |
| 131 if args.latest: | 123 if args.latest: |
| 132 releases = [get_model_list(existing_release_tags)[0]] | 124 releases = [next(get_model_list(existing_release_tags))] |
| 133 else: | 125 else: |
| 134 releases = filter_by_date( | 126 releases = filter_by_date( |
| 135 existing_release_tags, start_date=args.start_date, end_date=args.end_date | 127 existing_release_tags, start_date=args.start_date, end_date=args.end_date |
| 136 ) | 128 ) |
| 137 releases_to_download = [ | 129 releases_to_download = [ |
| 138 release | 130 release |
| 139 for release in releases | 131 for release in releases |
| 140 if release["tag_name"] not in existing_release_tags | 132 if release["tag_name"] not in existing_release_tags |
| 141 ] | 133 ] |
| 142 for release in releases_to_download: | 134 for release in releases_to_download: |
| 143 tag = download_and_unpack(release["tarball_url"], output_directory) | 135 fname = download_and_unpack(release["tarball_url"], output_directory) |
| 144 release_date = parse_date(tag) | |
| 145 if args.pangolearn_format_version is not None: | 136 if args.pangolearn_format_version is not None: |
| 146 version = args.pangolearn_format_version | 137 version = args.pangolearn_format_version |
| 147 else: | 138 else: |
| 148 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format | 139 # 2021-05-27 was the first release of pangoLEARN for pangolin 3, which changed DB format |
| 149 if release_date >= datetime.datetime(2021, 5, 27): | 140 if release["date"] >= datetime.datetime(2021, 5, 27): |
| 150 version = '3.0' | 141 version = '3.0' |
| 151 else: | 142 else: |
| 152 version = '1.0' | 143 version = '1.0' |
| 153 data_manager_dict["data_tables"][args.datatable_name].append( | 144 data_manager_dict["data_tables"][args.datatable_name].append( |
| 154 dict( | 145 dict( |
| 155 value=tag, | 146 value=release["tag_name"], |
| 156 description=release["name"], | 147 description=release["name"], |
| 157 format_version=version, | 148 format_version=version, |
| 158 path=output_directory + "/" + tag, | 149 path=output_directory + "/" + fname, |
| 159 ) | 150 ) |
| 160 ) | 151 ) |
| 161 data_manager_dict["data_tables"][args.datatable_name].sort( | 152 data_manager_dict["data_tables"][args.datatable_name].sort( |
| 162 key=operator.itemgetter("value"), reverse=True | 153 key=operator.itemgetter("value"), reverse=True |
| 163 ) | 154 ) |
