changeset 3:3e73c97f025d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bakta_database commit 487cb35fe55883ac6eeb8dda58b56c9ca2ec0a85
author iuc
date Fri, 23 Jun 2023 21:37:05 +0000
parents adfd6bf710bd
children d74850cf4e42
files data_manager/bakta_build_database.py data_manager/bakta_build_database.xml data_manager/macro.xml test-data/bakta_test.loc test-data/bakta_test_data_manager.json test-data/bakta_test_data_manager1.json test-data/bakta_test_data_manager2.json test-data/bakta_test_data_manager3.json test-data/bakta_test_data_manager_test2.json test-data/db-versions.json
diffstat 10 files changed, 132 insertions(+), 154 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/bakta_build_database.py	Sun Apr 16 08:29:25 2023 +0000
+++ b/data_manager/bakta_build_database.py	Fri Jun 23 21:37:05 2023 +0000
@@ -2,6 +2,7 @@
 import hashlib
 import json
 import os
+import re
 import sys
 import tarfile
 from datetime import datetime
@@ -16,38 +17,50 @@
     Extract bakta database information to make a json file for data_manager
     """
 
-    def __init__(self,
-                 data_table_name="bakta_database",
-                 db_name=Path.cwd().joinpath("db"),
-                 db_version="latest",
-                 test_mode=False):
+    def __init__(
+        self,
+        data_table_name="bakta_database",
+        db_name=Path.cwd().joinpath("db"),
+        db_version="latest",
+        tarball_name="db.tar.gz",
+        test_mode=False,
+    ):
         self.bakta_table_list = None
         self.db_url = None
+        self.db_type = ""
         self.data_table_entry = None
         self.data_table_name = data_table_name
         self.db_name = db_name
+        self.tar_name = tarball_name
         self.db_version = db_version
-        self.DB_VERSIONS_URL = 'https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json'
-        self.DB_TEST_URL = 'https://zenodo.org/record/7360542/files/db-versions.json'
+        self.DB_VERSIONS_URL = "https://raw.githubusercontent.com/oschwengers/bakta/master/db-versions.json"
+        self.DB_TEST_URL = "https://zenodo.org/record/8021032/files/db-versions.json"
         self.test_mode = test_mode
 
+    def get_database_type(self):
+        self.light_db = bool(re.search(pattern="light", string=self.db_version))
+        self.db_version = self.db_version.split(sep="_")[0]
+        if self.light_db:
+            self.db_type = "light"
+            self.tar_name = "db-light.tar.gz"
+            self.md5 = self.fetch_db_versions()["md5-light"]
+        else:
+            self.md5 = self.fetch_db_versions()["md5"]
+
     def get_data_table_format(self):
         """
         Skeleton of a data_table format
         return: a data table formated for json output
         """
-        self.data_table_entry = {
-            "data_tables": {
-                self.data_table_name: {}
-            }
-        }
+        self.data_table_entry = {"data_tables": {self.data_table_name: {}}}
         return self.data_table_entry
 
-    def fetch_db_versions(self, db_version="latest"):
+    def fetch_db_versions(self):
         """
         List bakta database info related to the db_version selected
         """
-        if self.test_mode is True:
+
+        if self.test_mode:
             self.DB_VERSIONS_URL = self.DB_TEST_URL
         try:
             with requests.get(self.DB_VERSIONS_URL) as resp:
@@ -55,38 +68,43 @@
         except IOError as e:
             print(e, file=sys.stderr)
             raise e
+
+        if self.db_version == "latest":
+            db_date_list = []
+            for db_dic in versions:
+                db_date_list.append(
+                    datetime.strptime(db_dic["date"], "%Y-%m-%d").date()
+                )
+            filtered_version = max(versions, key=lambda x: x["date"])
         else:
-            if db_version == "latest":
-                db_date_list = []
-                for db_dic in versions:
-                    db_date_list.append(datetime.strptime(db_dic["date"],
-                                                          '%Y-%m-%d').date())
-                filtered_version = max(versions, key=lambda x: x['date'])
-            else:
-                filtered_version = None
-                for item in versions:
-                    if '{0}.{1}'.format(item["major"], item["minor"]) == db_version:
-                        filtered_version = item
-                        break
-                if filtered_version is None:
-                    print("No matching version detected in the list")
-            if filtered_version is not None:
-                self.db_url = f"https://zenodo.org/record/" \
-                              f"{filtered_version['record']}/files/db.tar.gz"
-                self.db_version = db_version
-                return filtered_version
+            filtered_version = None
+            for item in versions:
+                if "{0}.{1}".format(item["major"], item["minor"]) == self.db_version:
+                    filtered_version = item
+                    break
+        if filtered_version is None:
+            print("No matching version detected in the list")
+        else:
+            self.db_url = f"https://zenodo.org/record/{filtered_version['record']}/files/{self.tar_name}"
+            return filtered_version
 
     def get_data_manager(self, bakta_database_info):
         self.bakta_table_list = self.get_data_table_format()
-        bakta_name = f"V{bakta_database_info['major']}." \
-                     f"{bakta_database_info['minor']}_" \
-                     f"{bakta_database_info['date']}"
-        tool_version = str(f"{bakta_database_info['software-min']['major']}."
-                           f"{bakta_database_info['software-min']['minor']}")
-        data_info = dict(value=bakta_name,
-                         dbkey=bakta_database_info['record'],
-                         bakta_version=tool_version,
-                         path="db")
+        bakta_name = (
+            f"V{bakta_database_info['major']}."
+            f"{bakta_database_info['minor']}{self.db_type}_"
+            f"{bakta_database_info['date']}"
+        )
+        tool_version = str(
+            f"{bakta_database_info['software-min']['major']}."
+            f"{bakta_database_info['software-min']['minor']}"
+        )
+        data_info = dict(
+            value=bakta_name,
+            dbkey=bakta_database_info["record"],
+            bakta_version=tool_version,
+            path="db",
+        )
         self.bakta_table_list["data_tables"][self.data_table_name] = [data_info]
         return self.bakta_table_list
 
@@ -98,110 +116,88 @@
     untar the download db and update for the amrfinderplus database
     """
 
-    def __init__(self,
-                 db_dir=Path.cwd(),
-                 db_name="bakta",
-                 tarball_name="db.tar.gz",
-                 test_mode=False):
+    def __init__(
+        self, db_dir=Path.cwd(), db_name="bakta", db_version="latest", test_mode=False
+    ):
         super().__init__()
         self.md5 = None
+        self.db_version = db_version
         self.db_dir = db_dir
         self.db_name = db_name
-        self.tarball_name = tarball_name
-        self.tarball_path = None
+        self.tarball_path = ""
         self.test_mode = test_mode
+        self.get_database_type()
 
     def download(self):
-        self.db_name = f'{self.db_name}_{self.db_version}'
-        bakta_path = Path(self.db_dir).joinpath(self.tarball_name)
+        self.db_name = f"{self.db_name}_{self.db_version}{self.db_type}"
+        bakta_path = Path(self.db_dir).joinpath(self.tar_name)
         try:
-            with bakta_path.open('wb') as fh_out, \
-                    requests.get(self.db_url, stream=True) as resp:
-                total_length = resp.headers.get('content-length')
+            with bakta_path.open("wb") as fh_out, requests.get(
+                    self.db_url, stream=True) as resp:
+                total_length = resp.headers.get("content-length")
                 if total_length is None:  # no content length header
                     for data in resp.iter_content(chunk_size=1024 * 1024):
                         fh_out.write(data)
                 else:
                     for data in resp.iter_content(chunk_size=1024 * 1024):
                         fh_out.write(data)
-            print(f'Download bakta database {self.db_version}')
+            print(f"Download bakta database {self.db_version}")
             self.tarball_path = bakta_path
         except IOError:
-            print(f'ERROR: Could not download file from Zenodo!'
-                  f' url={self.db_url}, path={self.tarball_name}')
+            print(
+                f"ERROR: Could not download file from Zenodo!"
+                f" url={self.db_url}, to={self.tarball_path}"
+            )
 
     def untar(self):
         db_path = Path(self.db_dir).as_posix()
         try:
-            with self.tarball_path.open('rb') as fh_in, \
-                    tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
+            with self.tarball_path.open("rb") as fh_in, tarfile.open(
+                fileobj=fh_in, mode="r:gz"
+            ) as tar_file:
                 tar_file.extractall(path=db_path)
-                print(f'Untar the database in {db_path}')
+                print(f"Untar the database in {db_path}")
                 return db_path
         except OSError:
-            sys.exit(f'ERROR: Could not extract {self.tarball_name} '
-                     f'to {self.db_name}')
+            sys.exit(f"ERROR: Could not extract {self.tar_name} " f"to {self.db_name}")
 
     def calc_md5_sum(self, buffer_size=1048576):
-        tarball_path = Path(self.db_dir).joinpath(self.tarball_name)
-        self.md5 = self.fetch_db_versions(db_version=self.db_version)["md5"]
+        tarball_path = Path(self.db_dir).joinpath(self.tar_name)
         md5 = hashlib.md5()
-        with tarball_path.open('rb') as fh:
+        with tarball_path.open("rb") as fh:
             data = fh.read(buffer_size)
             while data:
                 md5.update(data)
                 data = fh.read(buffer_size)
         if md5.hexdigest() == self.md5:
-            print('\t...md5 control database OK')
+            print("\t...md5 control database OK")
         else:
-            print(f"Error: corrupt database file! "
-                  f"calculated md5 = {md5.hexdigest()}"
-                  f" different from {self.md5} ")
-
-
-"""
-This is the method to download the amrfinderplus database need by bakta.
-Deprecated to use the amrfinderplus data_manager
-    def update_amrfinderplus_db(self):
-        amrfinderplus_db_path = f"{self.db_dir}/{self.db_name}/db/amrfinderplus-db"
-        if self.db_version == "test":
-            cmd = [
-                'amrfinder_update',
-                '--database', str(amrfinderplus_db_path),
-                '--force_update',
-                '--help'
-            ]
-        else:
-            cmd = [
-                'amrfinder_update',
-                '--database', str(amrfinderplus_db_path),
-                '--force_update'
-            ]
-        proc = sp.run(
-            cmd,
-            universal_newlines=True
-        )
-        if proc.returncode != 0:
-            print(f"ERROR: AMRFinderPlus failed! "
-                  f"command: 'amrfinder_update --force_update"
-                  f" --database {amrfinderplus_db_path}'")
-        else:
-            print("AMRFinderPlus database download")
-"""
+            print(
+                f"Error: corrupt database file! "
+                f"calculated md5 = {md5.hexdigest()}"
+                f" different from {self.md5} "
+            )
 
 
 def parse_arguments():
     # parse options and arguments
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument("data_manager_json")
-    arg_parser.add_argument("-d", "--database_version",
-                            help='Select the database version '
-                                 '(major and minor eg. 4.0),'
-                                 'default is the latest version',
-                            default="latest",
-                            required=True)
-    arg_parser.add_argument("-t", "--test", action='store_true',
-                            help="option to test the script with an empty database")
+    arg_parser.add_argument(
+        "-d",
+        "--database_version",
+        help="Select the database version "
+        "(major and minor eg. 4.0),"
+        "default is the latest version",
+        default="latest",
+        required=True,
+    )
+    arg_parser.add_argument(
+        "-t",
+        "--test",
+        action="store_true",
+        help="option to test the script with an empty database",
+    )
     return arg_parser.parse_args()
 
 
@@ -209,11 +205,13 @@
     all_args = parse_arguments()
     with open(all_args.data_manager_json) as fh:
         params = json.load(fh)
-    target_dir = params['output_data'][0]['extra_files_path']
+    target_dir = params["output_data"][0]["extra_files_path"]
     os.makedirs(target_dir)
     # init the class to download bakta db
-    bakta_upload = InstallBaktaDatabase(test_mode=all_args.test)
-    bakta_db = bakta_upload.fetch_db_versions(db_version=all_args.database_version)
+    bakta_upload = InstallBaktaDatabase(
+        test_mode=all_args.test, db_version=all_args.database_version
+    )
+    bakta_db = bakta_upload.fetch_db_versions()
     # update the path for galaxy
     bakta_upload.db_dir = target_dir
     # download the database
@@ -224,9 +222,9 @@
     bakta_upload.untar()
     # make the data_manager metadata
     bakta_data_manager = bakta_upload.get_data_manager(bakta_database_info=bakta_db)
-    with open(all_args.data_manager_json, 'w') as fh:
+    with open(all_args.data_manager_json, "w") as fh:
         json.dump(bakta_data_manager, fh, sort_keys=True)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
--- a/data_manager/bakta_build_database.xml	Sun Apr 16 08:29:25 2023 +0000
+++ b/data_manager/bakta_build_database.xml	Fri Jun 23 21:37:05 2023 +0000
@@ -20,6 +20,8 @@
             <option value="3.0">V3.0_2021-08-05</option>
             <option value="3.1">V3.1_2022-02-03</option>
             <option value="4.0">V4.0_2022-08-29</option>
+            <option value="5.0">V5.0_2023-02-20</option>
+            <option value="5.0_light">V5.0_light_2023-02-20</option>
         </param>
         <param name="test_data_manager" type="hidden" value=""/>
     </inputs>
@@ -31,13 +33,19 @@
         <test expect_num_outputs="1">
             <param name="test_data_manager" value="--test"/>
             <param name="database_select" value="1.0"/>
-            <output name="output_file" value="bakta_test_data_manager.json" />
+            <output name="output_file" value="bakta_test_data_manager1.json" />
         </test>
         <!-- Test 2 with the latest option -->
         <test expect_num_outputs="1">
             <param name="test_data_manager" value="--test"/>
             <param name="database_select" value="latest"/>
-            <output name="output_file" value="bakta_test_data_manager_test2.json" />
+            <output name="output_file" value="bakta_test_data_manager2.json" />
+        </test>
+        <!-- Test 3 with light db -->
+        <test expect_num_outputs="1">
+            <param name="test_data_manager" value="--test"/>
+            <param name="database_select" value="5.0_light"/>
+            <output name="output_file" value="bakta_test_data_manager3.json" />
         </test>
     </tests>
     <help><![CDATA[
--- a/data_manager/macro.xml	Sun Apr 16 08:29:25 2023 +0000
+++ b/data_manager/macro.xml	Fri Jun 23 21:37:05 2023 +0000
@@ -1,8 +1,8 @@
 <macros>
-    <token name="@TOOL_VERSION@">1.5.1</token>
+    <token name="@TOOL_VERSION@">1.8.1</token>
     <token name="@REQUESTS_VERSION@">2.27.1</token>
     <token name="@PYTHON_VERSION@">3.8</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">21.05</token>
     <xml name="requirements">
         <requirements>
--- a/test-data/bakta_test.loc	Sun Apr 16 08:29:25 2023 +0000
+++ b/test-data/bakta_test.loc	Fri Jun 23 21:37:05 2023 +0000
@@ -1,9 +1,6 @@
-# this is a tab separated file describing the location of bakta database
-#
-# the columns are:
-# value, dbkey, bakta_version, path
-#
-# for example
-7197299	V0.0_date_test	0.0	${__HERE__}
-V1.0_2022-10-12	7197299	1.4	/tmp/tmpiyh6lcqw/galaxy-dev/tool-data/bakta_database/7197299
-V2.0_2022-11-25	7360139	1.5	/tmp/tmpiyh6lcqw/galaxy-dev/tool-data/bakta_database/7360139
+V1.0_2022-10-12	7197299	1.4	/tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/7197299
+V5.0_2023-06-08	8021027	1.8	/tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/8021027
+V5.0light_2023-06-08	8021027	1.8	/tmp/tmpq5t7s3c5/galaxy-dev/tool-data/bakta_database/8021027
+V1.0_2022-10-12	7197299	1.4	/tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/7197299
+V5.0_2023-06-08	8021027	1.8	/tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/8021027
+V5.0light_2023-06-08	8021027	1.8	/tmp/tmpydhjlpxl/galaxy-dev/tool-data/bakta_database/8021027
--- a/test-data/bakta_test_data_manager.json	Sun Apr 16 08:29:25 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-{"data_tables": {"bakta_database": [{"bakta_version": "1.4", "dbkey": "7197299", "path": "db", "value": "V1.0_2022-10-12"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bakta_test_data_manager1.json	Fri Jun 23 21:37:05 2023 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"bakta_database": [{"bakta_version": "1.4", "dbkey": "7197299", "path": "db", "value": "V1.0_2022-10-12"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bakta_test_data_manager2.json	Fri Jun 23 21:37:05 2023 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"bakta_database": [{"bakta_version": "1.8", "dbkey": "8021027", "path": "db", "value": "V5.0_2023-06-08"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/bakta_test_data_manager3.json	Fri Jun 23 21:37:05 2023 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"bakta_database": [{"bakta_version": "1.8", "dbkey": "8021027", "path": "db", "value": "V5.0light_2023-06-08"}]}}
\ No newline at end of file
--- a/test-data/bakta_test_data_manager_test2.json	Sun Apr 16 08:29:25 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-{"data_tables": {"bakta_database": [{"bakta_version": "1.5", "dbkey": "7360139", "path": "db", "value": "V2.0_2022-11-25"}]}}
\ No newline at end of file
--- a/test-data/db-versions.json	Sun Apr 16 08:29:25 2023 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-[
-    {
-        "date": "2022-10-12",
-        "major": 1,
-        "minor": 0,
-        "doi": "10.5281/zenodo.7197299",
-        "record": "7197299",
-        "md5": "8b0250c17078742fc12207d4efb0fc1a",
-        "software-min": {
-            "major": 1,
-            "minor": 4
-        }
-    },
-    {
-        "date": "2022-11-25",
-        "major": 2,
-        "minor": 0,
-        "doi": "10.5281/zenodo.7360139",
-        "record": "7360139",
-        "md5": "ebdb799a6bd97e56ca359db781ab8bab",
-        "software-min": {
-            "major": 1,
-            "minor": 5
-        }
-    }
-]