Mercurial > repos > iuc > amrfinderplus_data_manager_build
changeset 0:eea0c38a9afd draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_amrfinderplus commit 59077a173599fc9d355a5d36ad7875050dbe3e19
author | iuc |
---|---|
date | Thu, 05 Jan 2023 14:28:07 +0000 |
parents | |
children | 592ef3959907 |
files | data_manager/data_manager_build_amrfinderplus.py data_manager/data_manager_build_amrfinderplus.xml data_manager/macro.xml data_manager_conf.xml test-data/amrfinderplus.loc.test test-data/amrfinderplus_test_data_manager_1.json test-data/amrfinderplus_test_data_manager_2.json tool-data/amrfinderplus.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 10 files changed, 414 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
"""Galaxy data manager script for the AMRFinderPlus database.

Downloads a release of the AMRFinderPlus database from the NCBI FTP server,
builds the HMM and BLAST indexes next to the downloaded files and writes the
data-table entry back into the JSON file handed over by Galaxy.
"""
import argparse
import json
import os
import subprocess as sp
from ftplib import FTP
from io import BytesIO
from pathlib import Path

import pandas as pd


class GetAmrFinderPlusDataManager:
    """
    Create the json file with database information for galaxy data manager
    """

    def __init__(self,
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 date_version=None):
        """
        param amrfinderplus_database: name of the data table to fill
        param db_name: directory name of the database, used as "path" entry
        param amrfinderplus_version: major version of the database (or "latest")
        param date_version: dated release inside the major version
        """
        self.data_table_name = amrfinderplus_database
        self._db_name = db_name
        self._amrfinderplus_version = amrfinderplus_version
        self._amrfinderplus_date_version = date_version
        self.data_table_entry = None
        self.amrfinderplus_table_list = None

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {
            "data_tables": {
                self.data_table_name: {}
            }
        }
        return self.data_table_entry

    def get_data_manager(self):
        """
        Create the empty data table format and add all the information into
        return: The data table with database information
        """
        self.amrfinderplus_table_list = self.get_data_table_format()
        version = self._amrfinderplus_version
        date_version = self._amrfinderplus_date_version
        data_info = dict(value=f"amrfinderplus_V{version}_{date_version}",
                         name=f"V{version}-{date_version}",
                         path=self._db_name)
        self.amrfinderplus_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.amrfinderplus_table_list


class DownloadAmrFinderPlusDatabase(GetAmrFinderPlusDataManager):
    """
    Download the amrfinderplus database from the ncbi.
    Make the database available with hmm and indexed files
    Build the data manager infos for galaxy
    """

    def __init__(self,
                 output_dir=Path.cwd(),
                 ncbi_url="ftp.ncbi.nlm.nih.gov",
                 ftp_login="anonymous",
                 ftp_password="anonymous",
                 amrfinderplus_database="amrfinderplus_database",
                 db_name="amrfinderplus-db",
                 amrfinderplus_version="latest",
                 json_file_path=None,
                 date_version=None,
                 amrfinderplus_db_path=None,
                 test_mode=False):
        """
        param output_dir: directory in which the database directory is created
        param ncbi_url: host name of the NCBI FTP server
        param ftp_login: login used for the FTP server
        param ftp_password: password used for the FTP server
        param amrfinderplus_database: name of the data table to fill
        param db_name: name of the database directory created in output_dir
        param amrfinderplus_version: major version of the database (or "latest")
        param json_file_path: json file provided by galaxy (read, then rewritten)
        param date_version: dated release inside the major version
        param amrfinderplus_db_path: path of an already existing database directory
        param test_mode: when True only a reduced set of files is downloaded
        """
        # Forward the shared settings to the parent class so that the data
        # table name really follows `amrfinderplus_database` instead of
        # silently falling back to the parent's default value.
        super().__init__(amrfinderplus_database=amrfinderplus_database,
                         db_name=db_name,
                         amrfinderplus_version=amrfinderplus_version,
                         date_version=date_version)
        self.json_file_path = json_file_path
        self._output_dir = output_dir
        self._ncbi_ftp_url = ncbi_url
        self._ncbi_database_path = "pathogen/Antimicrobial_resistance/AMRFinderPlus/database"
        self._login = ftp_login
        self._password = ftp_password
        self._amrfinderplus_database = amrfinderplus_database
        self.species_list = None
        self.test_mode = test_mode
        self.amrfinderplus_db_path = amrfinderplus_db_path

    @staticmethod
    def subprocess_cmd(command, *args):
        """
        Method to call external tools with any parameters
        :param command: command name from the tool used (e.g. wget or makeblastdb)
        :param args: free number of argument need for the command tool (e.g. -r, -P ...)
        :return: None, a non-zero exit status is only reported on stdout
        """
        cmd = [command, *args]
        proc = sp.run(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
        if proc.returncode != 0:
            print(f'Error type {proc.returncode} with : \n {proc}')

    def download_amrfinderplus_db(self):
        """
        Download the amrfinderplus database from the ncbi ftp server

        The target directory <output_dir>/<db_name> is created first
        (os.makedirs raises if it already exists). When the requested
        version is "latest" the real version numbers are resolved on the
        FTP server before the download url is built.
        """
        self.amrfinderplus_db_path = f'{self._output_dir}/{self._db_name}'
        os.makedirs(self.amrfinderplus_db_path)
        if self._amrfinderplus_version == 'latest':
            self.get_amrfinderplus_version()

        amrfinderplus_ftp_path = f"ftp://{self._login}:" \
                                 f"{self._password}@{self._ncbi_ftp_url}/" \
                                 f"{self._ncbi_database_path}/" \
                                 f"{self._amrfinderplus_version}/" \
                                 f"{self._amrfinderplus_date_version}"
        if self.test_mode:
            # Test mode: fetch only the few files needed by the later steps,
            # each one written to an explicit output file (-O).
            file_list = ["AMR_DNA-Escherichia", "version.txt", "taxgroup.tab", "database_format_version.txt"]
            for file_name in file_list:
                self.subprocess_cmd("wget",
                                    "-nd",
                                    "-np",
                                    "-r",
                                    f"{amrfinderplus_ftp_path}/{file_name}",
                                    "-O",
                                    f"{self.amrfinderplus_db_path}/{file_name}")
        else:
            # Full mode: fetch the release directory into the database
            # directory (-P sets the download prefix).
            self.subprocess_cmd("wget",
                                "-nd",
                                "-np",
                                "-r",
                                amrfinderplus_ftp_path,
                                "-P",
                                self.amrfinderplus_db_path)

    def make_hmm_profile(self):
        """
        Make the hmm profile using the AMR.LIB file previously download
        Nothing is indexed in test mode or when AMR.LIB is absent.
        """
        hmm_file = Path(f"{self.amrfinderplus_db_path}/AMR.LIB")
        if hmm_file.exists() and not self.test_mode:
            self.subprocess_cmd("hmmpress", "-f", hmm_file)
        else:
            print("hmm_file file is missing to make hmm profiles")

    def extract_filelist_makeblast(self):
        """
        Extract the list of species which have file in the database
        The result is stored in self.species_list: the taxgroups of
        taxgroup.tab with at least one nucleotide reference gene (restricted
        to "Escherichia" in test mode), or an empty list when taxgroup.tab
        is missing.
        """
        taxa_group_path = Path(f"{self.amrfinderplus_db_path}/taxgroup.tab")
        if not taxa_group_path.exists():
            print("taxgroup.tab file is missing to list available species")
            # Keep an iterable value so that make_blastdb can still index the
            # remaining files instead of failing on None.
            self.species_list = []
            return
        taxa_table = pd.read_table(taxa_group_path)
        taxa_table.columns = ["taxgroup", "gpipe_taxgroup", "number_of_nucl_ref_genes"]
        with_nucl_genes = taxa_table["number_of_nucl_ref_genes"] > 0
        taxa_groups = taxa_table.loc[with_nucl_genes, "taxgroup"]
        if self.test_mode:
            taxa_groups = taxa_groups[taxa_groups == "Escherichia"]
        self.species_list = taxa_groups.tolist()

    def make_blastdb(self):
        """
        Index fasta file for blast
        Note: the working directory of the process is changed to the
        database directory.
        """
        self.extract_filelist_makeblast()
        species = self.species_list or []
        nucl_file_db_list = [f'{self.amrfinderplus_db_path}/AMR_DNA-{specie}' for specie in species]
        amr_dna = f'{self.amrfinderplus_db_path}/AMR_CDS'
        amr_prot = f'{self.amrfinderplus_db_path}/AMRProt'
        os.chdir(self.amrfinderplus_db_path)
        if Path(amr_dna).exists():
            nucl_file_db_list.append(amr_dna)
        else:
            print("No file AMR_CDS detected for indexing")
        if Path(amr_prot).exists():
            self.subprocess_cmd("makeblastdb", "-in", amr_prot, "-dbtype", "prot")
        else:
            print("No file AMRProt detected for indexing")
        for nucl_file in nucl_file_db_list:
            self.subprocess_cmd("makeblastdb", "-in", nucl_file, "-dbtype", "nucl")

    def get_amrfinderplus_version(self, version_file="version.txt",
                                  database_version_file="database_format_version.txt"):
        """
        Check the version when latest if provided and update the number
        param version_file: name of the file whose first line becomes the date version
        param database_version_file: name of the file whose first line gives
            the version (only its first two dot separated fields are kept)
        """
        date_version_buffer = BytesIO()
        format_version_buffer = BytesIO()
        # The context manager closes the FTP connection even if a transfer fails.
        with FTP(self._ncbi_ftp_url) as ftp:
            ftp.login(self._login, self._password)
            ftp.cwd(f"{self._ncbi_database_path}/{self._amrfinderplus_version}")
            ftp.retrbinary(f'RETR {version_file}', date_version_buffer.write)
            ftp.retrbinary(f'RETR {database_version_file}', format_version_buffer.write)
        self._amrfinderplus_date_version = date_version_buffer.getvalue().decode("utf-8").splitlines()[0]
        format_version = format_version_buffer.getvalue().decode("utf-8").splitlines()[0]
        self._amrfinderplus_version = '.'.join(format_version.split(".")[:2])

    def read_json_input_file(self):
        """
        Import the json file
        The extra_files_path of the first output is created and becomes the
        output directory of the download.
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write in the imported json file
        The file content is replaced by the data table entry.
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)


def parse_arguments():
    """
    List of arguments provided by the user
    return: parsed arguments
    """
    # parse options and arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json",
                            help="json file from galaxy")
    arg_parser.add_argument("--db_version", default="latest",
                            help="select the major version of the database (e.g. 3.10, 3.8), default is latest")
    arg_parser.add_argument("--db_date",
                            help="select the date into the database version (e.g. 2022-10-11.2)")
    arg_parser.add_argument("--test", action='store_true',
                            help="option to test the script with an lighted database")
    return arg_parser.parse_args()


def main():
    """
    Run the whole workflow: read the galaxy json file, download the
    database, build the hmm and blast indexes and write the data table entry
    """
    all_args = parse_arguments()
    amrfinderplus_download = DownloadAmrFinderPlusDatabase(amrfinderplus_version=all_args.db_version,
                                                           date_version=all_args.db_date,
                                                           json_file_path=all_args.data_manager_json,
                                                           test_mode=all_args.test)
    amrfinderplus_download.read_json_input_file()
    amrfinderplus_download.download_amrfinderplus_db()
    amrfinderplus_download.make_hmm_profile()
    amrfinderplus_download.make_blastdb()
    amrfinderplus_download.write_json_infos()


if __name__ == '__main__':
    main()
<tool id="data_manager_build_amrfinderplus" name="amrfinderplus_datamanager" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>AMRfinderplus database builder</description>
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- The date option is only passed for an explicit version: the "latest"
         branch of the conditional defines no date parameter. -->
    <command detect_errors="exit_code">
    <![CDATA[
        python '$__tool_directory__/data_manager_build_amrfinderplus.py'
        '$output_file'
        --db_version '$database_list.database_version_select'
        #if $database_list.database_version_select != 'latest':
            --db_date '$database_list.database_date_select'
        #end if
        $test_data_manager
    ]]></command>
    <inputs>
        <conditional name="database_list">
            <param name="database_version_select" type="select" label="Database version">
                <option value="latest" selected="true">Latest available version</option>
                <option value="3.10">V3.10</option>
                <option value="3.9">V3.9</option>
                <option value="3.8">V3.8</option>
                <option value="3.6">V3.6</option>
            </param>
            <when value="latest">
            </when>
            <when value="3.10">
                <!-- Each option value must equal its label: the value is what
                     is handed to the db_date option of the script. -->
                <param name="database_date_select" type="select" label="Date version">
                    <option value="2022-10-11.2" selected="true">2022-10-11.2</option>
                    <option value="2022-08-09.1">2022-08-09.1</option>
                    <option value="2022-05-26.1">2022-05-26.1</option>
                    <option value="2022-04-04.1">2022-04-04.1</option>
                    <option value="2021-12-21.1">2021-12-21.1</option>
                    <option value="2021-09-30.1">2021-09-30.1</option>
                    <option value="2021-08-11.1">2021-08-11.1</option>
                    <option value="2021-06-01.1">2021-06-01.1</option>
                    <option value="2021-03-01.1">2021-03-01.1</option>
                </param>
            </when>
            <when value="3.9">
                <param name="database_date_select" type="select" label="Date version">
                    <option value="2020-11-09.1" selected="true">2020-11-09.1</option>
                    <option value="2020-12-17.1">2020-12-17.1</option>
                </param>
            </when>
            <when value="3.8">
                <param name="database_date_select" type="select" label="Date version">
                    <option value="2020-09-30.1" selected="true">2020-09-30.1</option>
                    <option value="2020-09-22.2">2020-09-22.2</option>
                    <option value="2020-07-16.2">2020-07-16.2</option>
                    <option value="2020-06-11.1">2020-06-11.1</option>
                    <option value="2020-05-04.1">2020-05-04.1</option>
                </param>
            </when>
            <when value="3.6">
                <param name="database_date_select" type="select" label="Date version">
                    <option value="2020-01-22.1" selected="true">2020-01-22.1</option>
                    <option value="2020-03-20.1">2020-03-20.1</option>
                </param>
            </when>
        </conditional>
        <!-- Hidden parameter, empty by default; the tests below set it to the
             test flag of the script. -->
        <param name="test_data_manager" type="hidden" value=""/>
    </inputs>
    <outputs>
        <data name="output_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <!-- Test_1 DB latest -->
        <test expect_num_outputs="1">
            <param name="test_data_manager" value="--test"/>
            <output name="output_file" value="amrfinderplus_test_data_manager_1.json"/>
        </test>
        <!-- Test_2 DB 3.6 -->
        <test expect_num_outputs="1">
            <param name="test_data_manager" value="--test"/>
            <conditional name="database_list">
                <param name="database_version_select" value="3.6"/>
                <param name="database_date_select" value="2020-03-20.1"/>
            </conditional>
            <output name="output_file" value="amrfinderplus_test_data_manager_2.json"/>
        </test>
    </tests>
    <help><![CDATA[
        Download amrfinderplus database from the NCBI server
    ]]></help>
    <citations>
        <citation type="doi">10.1038/s41598-021-91456-0</citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/macro.xml Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,15 @@ +<?xml version="1.0"?> +<macros> + <token name="@TOOL_VERSION@">3.10.45</token> + <token name="@PYTHON_VERSION@">3.10.6</token> + <token name="@PANDAS@">1.5.1</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.05</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">ncbi-amrfinderplus</requirement> + <requirement type="package" version="@PYTHON_VERSION@">python</requirement> + <requirement type="package" version="@PANDAS@">pandas</requirement> + </requirements> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_build_amrfinderplus.xml" id="data_manager_build_amrfinderplus" version="@TOOL_VERSION@"> + <data_table name="amrfinderplus_database"> + <output> + <column name="value" /> + <column name="name" /> + <column name="path" output_ref="output_file"> + <move type="directory" relativize_symlinks="True"> + <source>${path}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">amrfinderplus-db/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/amrfinderplus-db/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus.loc.test Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,8 @@ +# this is a tab separated file describing the location of amrfinderplus database +# +# the columns are: +# value, name, path +# +# for example +amrfinderplus_V3.10_2022-10-11.2 V3.10-2022-10-11.2 amrfinderplus-db +amrfinderplus_V3.6_2020-03-20.1 V3.6-2020-03-20.1 amrfinderplus-db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus_test_data_manager_1.json Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"amrfinderplus_database": [{"name": "V3.11-2022-12-19.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.11_2022-12-19.1"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/amrfinderplus_test_data_manager_2.json Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"amrfinderplus_database": [{"name": "V3.6-2020-03-20.1", "path": "amrfinderplus-db", "value": "amrfinderplus_V3.6_2020-03-20.1"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/amrfinderplus.loc.sample Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,7 @@ +# this is a tab separated file describing the location of amrfinderplus database +# +# the columns are: +# value, name, path +# +# for example +#amrfinderplus_V3.6_2020-03-20.1 V3.6-2020-03-20.1 amrfinderplus-db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of amrfinderplus database in the required format --> + <table name="amrfinderplus_database" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/amrfinderplus.loc" /> + </table> +</tables> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Jan 05 14:28:07 2023 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of amrfinderplus database in the required format --> + <table name="amrfinderplus_database" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/amrfinderplus.loc.test"/> + </table> +</tables>