Mercurial > repos > iuc > data_manager_gtdbtk_database_installer
changeset 2:6ab422fba1a3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_gtdbtk_database_installer commit ad14947c3e13babe90a6878b45608fe56a16150d
author | iuc |
---|---|
date | Tue, 13 Aug 2024 21:13:43 +0000 |
parents | 2814c058a087 |
children | c4830a9870fa |
files | data_manager/gtdbtk_database_installer.py data_manager/gtdbtk_database_installer.xml data_manager_conf.xml test-data/gtdbtk_database.loc test-data/gtdbtk_database_metadata_versioned.loc test-data/gtdbtk_database_versioned.loc.sample tool-data/gtdbtk_database.loc.sample tool-data/gtdbtk_database_metadata_versioned.loc.sample tool-data/gtdbtk_database_versioned.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 11 files changed, 197 insertions(+), 92 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/gtdbtk_database_installer.py Tue Jan 03 09:05:09 2023 +0000 +++ b/data_manager/gtdbtk_database_installer.py Tue Aug 13 21:13:43 2024 +0000 @@ -1,25 +1,57 @@ #!/usr/bin/env python import argparse +import gzip import json import os import shutil import sys import tarfile +from datetime import datetime from urllib.parse import urlparse -from urllib.request import Request -from urllib.request import urlopen +from urllib.request import Request, urlopen + +# rather provide the urls based on the release, less error potential for the admins ! +urls = { + "202": { + "full": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz", + "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/ar122_taxonomy_r202.tsv.gz", + "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release202/202.0/bac120_taxonomy_r202.tsv.gz", + }, + "207": { + "full": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_data.tar.gz", + "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/ar53_taxonomy_r207.tsv.gz", + "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release207/207.0/bac120_taxonomy_r207.tsv.gz", + }, + "214": { + "full": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/auxillary_files/gtdbtk_r214_data.tar.gz", + "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/ar53_taxonomy_r214.tsv.gz", + "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release214/214.0/bac120_taxonomy_r214.tsv.gz", + }, + "220": { + "full": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz", + "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz", + "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz", + }, + "test": { # using VERSION to check if files are there + "full": 
"https://data.gtdb.ecogenomic.org/releases/release220/220.0/VERSION.txt", + "meta_ar": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/ar53_taxonomy_r220.tsv.gz", + "meta_bac": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/bac120_taxonomy_r220.tsv.gz", + }, +} def url_download(url, target_directory): url_parts = urlparse(url) - tarball = os.path.abspath(os.path.join(target_directory, os.path.basename(url_parts.path))) + tarball = os.path.abspath( + os.path.join(target_directory, os.path.basename(url_parts.path)) + ) src = None dst = None try: req = Request(url) src = urlopen(req) - with open(tarball, 'wb') as dst: + with open(tarball, "wb") as dst: while True: chunk = src.read(2**10) if chunk: @@ -32,9 +64,20 @@ if src is not None: src.close() if tarfile.is_tarfile(tarball): - fh = tarfile.open(tarball, 'r:*') + fh = tarfile.open(tarball, "r:*") else: - return tarball + # unzip metadata file + if ".gz" in tarball: + with gzip.open(tarball, "rb") as f_in: + unzipped_file = tarball.strip(".gz") + with open(unzipped_file, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + os.remove(tarball) + folder_of_unzipped_file = os.path.dirname(unzipped_file) + return folder_of_unzipped_file + else: + # this is basically only the return for the test not using a tarfile + return tarball fh.extractall(target_directory) fh.close() os.remove(tarball) @@ -52,33 +95,83 @@ return target_directory -def download(database_id, database_name, url, out_file): +def download(database_name, release, meta, test, out_file): with open(out_file) as fh: params = json.load(fh) - target_directory = params['output_data'][0]['extra_files_path'] + target_directory = params["output_data"][0]["extra_files_path"] os.makedirs(target_directory) - file_path = url_download(url, target_directory) + + if test: + release = "test" + + # download both taxonomy metadata tables + if meta: + url = urls[release]["meta_ar"] + file_path = url_download(url, target_directory) + url = 
urls[release]["meta_bac"] + file_path = url_download(url, target_directory) + # download the full DB + else: + url = urls[release]["full"] + file_path = url_download(url, target_directory) + + time = datetime.utcnow().strftime("%Y-%m-%d") data_manager_json = {"data_tables": {}} data_manager_entry = {} - data_manager_entry['value'] = database_id - data_manager_entry['name'] = database_name - data_manager_entry['path'] = file_path - data_manager_json["data_tables"]["gtdbtk_database"] = data_manager_entry + data_manager_entry["value"] = f"{database_name}_release_{release}_downloaded_{time}" + data_manager_entry["name"] = database_name + data_manager_entry["path"] = file_path + data_manager_entry["version"] = release - with open(out_file, 'w') as fh: + # store in dedicated metadata table + if meta: + data_manager_json["data_tables"][ + "gtdbtk_database_metadata_versioned" + ] = data_manager_entry + else: + data_manager_json["data_tables"][ + "gtdbtk_database_versioned" + ] = data_manager_entry + + with open(out_file, "w") as fh: json.dump(data_manager_json, fh, sort_keys=True) parser = argparse.ArgumentParser() -parser.add_argument('--database_name', dest='database_name', help='GTDB-Tk database display name') -parser.add_argument('--database_id', dest='database_id', help='Unique GTDB-Tk database id') -parser.add_argument('--url', dest='url', help='URL to download GTDB-Tk databse version') -parser.add_argument('--out_file', dest='out_file', help='JSON output file') +parser.add_argument( + "--database_name", dest="database_name", help="GTDB-Tk database display name" +) + +parser.add_argument("--version", dest="version", help="DB version") + +parser.add_argument( + "--release", dest="release", help="Release of the GTDB-Tk database version" +) +parser.add_argument("--out_file", dest="out_file", help="JSON output file") +parser.add_argument( + "--meta", + dest="meta", + action="store_true", + help="Store meta data flag", +) + +parser.add_argument( + "--test", + dest="test", 
+ action="store_true", + help="Run test", +) args = parser.parse_args() -download(args.database_id, args.database_name, args.url, args.out_file) +download( + args.database_name, + args.release, + args.meta, + args.test, + args.out_file, +)
--- a/data_manager/gtdbtk_database_installer.xml Tue Jan 03 09:05:09 2023 +0000 +++ b/data_manager/gtdbtk_database_installer.xml Tue Aug 13 21:13:43 2024 +0000 @@ -11,41 +11,59 @@ <command> <![CDATA[ python '$__tool_directory__/gtdbtk_database_installer.py' - --database_id '$database_id' --database_name '$database_name' - --url '$url' + --release '$release' --out_file '$out_file' + $meta + $test ]]> </command> <inputs> <param name="database_name" type="text" value="" label="Database name or description" help="This value will be displayed in the GTDB-Tk Database select list"/> - <param name="database_id" type="text" value="" label="Database id" help="This value must be unique with no whitespace allowed - use underscores"/> - <param - name="url" - type="text" - value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/auxillary_files/gtdbtk_r202_data.tar.gz" - label="URL for GTDB release" - help="This should point to a GTDB release tarball. A table of available databases and their version compatability can be found at https://ecogenomics.github.io/GTDBTk/installing/index.html#gtdb-tk-reference-data." - /> + <param name="meta" type="boolean" truevalue="--meta" falsevalue="" checked="false" label="Only store GTDBTK metadata in a dedicated data table. " /> + <param name="test" type="hidden" value="" checked="false" label="Run a dry test run !" 
/> + <param name="release" type="select" multiple="false" label="GTDB Release"> + <option value="202">202</option> + <option value="207">207</option> + <option value="214">214</option> + <option value="220">220</option> + </param> </inputs> <outputs> <data name="out_file" format="data_manager_json"/> </outputs> <tests> <test> + <!-- TODO --> <!-- Not actually installing a huge GTDB-Tk database --> - <param name="database_id" value="release202"/> + <param name="release" value="202"/> <param name="database_name" value="GTDB-Tk database release 202"/> - <param name="url" value="https://data.gtdb.ecogenomic.org/releases/release202/202.0/VERSION"/> + <param name="test" value="--test"/> <output name="out_file"> <assert_contents> <has_text text="GTDB-Tk database release 202"/> - <has_text text="release202"/> + <has_text text="release_test"/> + </assert_contents> + </output> + </test> + <test> + <!-- Test meta data download --> + <param name="release" value="202"/> + <param name="database_name" value="GTDB-Tk database release 202 metadata"/> + <param name="meta" value="true"/> + <param name="test" value="--test"/> + <output name="out_file"> + <assert_contents> + <has_text text="GTDB-Tk database release 202 metadata"/> + <has_text text="release_test"/> </assert_contents> </output> </test> </tests> <help> +This data manager downloads the DB required for GTDB-Tk tools such as +`gtdbtk classify_wf`. The meta option allows downloading only the metadata for the +corresponding DB, which is used by tools like `gtdb_to_taxdump`. </help> <citations> <citation type="doi">doi.org/10.1038/s41587-020-0501-8</citation>
--- a/data_manager_conf.xml Tue Jan 03 09:05:09 2023 +0000 +++ b/data_manager_conf.xml Tue Aug 13 21:13:43 2024 +0000 @@ -1,14 +1,29 @@ <data_managers> <data_manager tool_file="data_manager/gtdbtk_database_installer.xml" id="gtdbtk_database_installer"> - <data_table name="gtdbtk_database"> + <data_table name="gtdbtk_database_versioned"> <output> <column name="value"/> <column name="name"/> + <column name="version"/> <column name="path" output_ref="out_file"> <move type="directory" relativize_symlinks="True"> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database/${value}</target> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_versioned/${value}</target> </move> - <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database/${value}</value_translation> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_versioned/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="gtdbtk_database_metadata_versioned"> + <output> + <column name="value"/> + <column name="name"/> + <column name="version"/> + <column name="path" output_ref="out_file"> + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">gtdbtk_database_metadata_versioned/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/gtdbtk_database_metadata_versioned/${value}</value_translation> <value_translation type="function">abspath</value_translation> </column> </output>
--- a/test-data/gtdbtk_database.loc Tue Jan 03 09:05:09 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -# This is a sample file distributed with Galaxy that enables tools -# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc -# file has this format (longer white space characters are TAB characters): -# -# <unique_build_id> <display_name> <directory_path> -# -# So, for example, if you have the gtdbtk 202 stored in -# /depot/data2/galaxy/gtdbtk/202/, -# then the gtdbtk_databases.loc entry would look like this: -# -# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 -# -# and your /depot/data2/galaxy/gtdbtk/release202 directory -# would contain GTDB-Tk database files for release 202, sommething like this: -# -#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ -#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/ -release202 GTDB-Tk database release 202 /depot/data2/galaxy/tool-data/gtdbtk_database/release202
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtdbtk_database_metadata_versioned.loc Tue Aug 13 21:13:43 2024 +0000 @@ -0,0 +1,5 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk database metadata. The gtdbtk_database_metadata_versioned.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <version> <directory_path>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtdbtk_database_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000 @@ -0,0 +1,5 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_database_versioned.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <version> <directory_path>
--- a/tool-data/gtdbtk_database.loc.sample Tue Jan 03 09:05:09 2023 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -# This is a sample file distributed with Galaxy that enables tools -# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc -# file has this format (longer white space characters are TAB characters): -# -# <unique_build_id> <display_name> <directory_path> -# -# So, for example, if you have the gtdbtk 202 stored in -# /depot/data2/galaxy/gtdbtk/202/, -# then the gtdbtk_databases.loc entry would look like this: -# -# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 -# -# and your /depot/data2/galaxy/gtdbtk/release202 directory -# would contain GTDB-Tk database files for release 202, sommething like this: -# -#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ -#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gtdbtk_database_metadata_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000 @@ -0,0 +1,5 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk database metadata. The gtdbtk_database_metadata_versioned.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <version> <directory_path>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gtdbtk_database_versioned.loc.sample Tue Aug 13 21:13:43 2024 +0000 @@ -0,0 +1,6 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_database_versioned.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <version> <directory_path> +s \ No newline at end of file
--- a/tool_data_table_conf.xml.sample Tue Jan 03 09:05:09 2023 +0000 +++ b/tool_data_table_conf.xml.sample Tue Aug 13 21:13:43 2024 +0000 @@ -1,7 +1,12 @@ +<?xml version="1.0"?> <tables> <!-- Locations of GTDB-Tk database versions 202 and higher --> - <table name="gtdbtk_database" comment_char="#"> - <columns>value, name, path</columns> - <file path="tool-data/gtdbtk_database.loc" /> + <table name="gtdbtk_database_versioned" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="tool-data/gtdbtk_database_versioned.loc" /> + </table> + <table name="gtdbtk_database_metadata_versioned" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="tool-data/gtdbtk_database_metadata_versioned.loc" /> </table> </tables>
--- a/tool_data_table_conf.xml.test Tue Jan 03 09:05:09 2023 +0000 +++ b/tool_data_table_conf.xml.test Tue Aug 13 21:13:43 2024 +0000 @@ -1,7 +1,11 @@ <tables> <!-- Location of databases for gtdbtk version 202 and higher --> - <table name="gtdbtk_database" comment_char="#"> - <columns>value, name, path</columns> - <file path="${__HERE__}/test-data/gtdbtk_database.loc" /> + <table name="gtdbtk_database_versioned" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="${__HERE__}/test-data/gtdbtk_database_versioned.loc" /> + </table> + <table name="gtdbtk_database_metadata_versioned" comment_char="#"> + <columns>value, name, version, path</columns> + <file path="${__HERE__}/test-data/gtdbtk_database_metadata_versioned.loc" /> </table> </tables>