Mercurial > repos > iuc > data_manager_ncbi_fcs_gx_database_downloader
changeset 0:6be6e6198ac3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 4a6561ed00e004260be3f3c29d81e814c60e20af
author | iuc |
---|---|
date | Fri, 12 Jan 2024 22:11:17 +0000 |
parents | |
children | 35cef758050c |
files | data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases.loc.sample tool-data/ncbi_fcs_gx_divisions.tsv.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 11 files changed, 322 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Galaxy data manager script: data_manager_ncbi_fcs_gx_database_downloader.py.

Synchronises an NCBI FCS GX database described by a manifest into a local
directory, then writes a data-manager JSON document with entries for the
``ncbi_fcs_gx_databases`` and ``ncbi_fcs_gx_divisions`` data tables.
"""

import argparse
import json
import os
import subprocess
import sys

# File-name suffix that a source manifest must carry; the part before it is
# used as the prefix of the taxa file name.
MANIFEST_SUFFIX = ".manifest"

# Division value that is always offered for unknown/unclassified samples.
UNKNOWN_DIVISION = "unkn:unknown"


def main():
    """Download the database and write the data-manager JSON output file."""
    opts = parse_args()

    # The divisions are read from a file inside the synchronised directory,
    # so the sync must run first. Keep that ordering explicit.
    databases = sync_files(opts)
    divisions = get_divisions(opts)

    output_dict = {
        "data_tables": {
            "ncbi_fcs_gx_databases": databases,
            "ncbi_fcs_gx_divisions": divisions,
        }
    }

    with open(opts.output_file, "w", encoding="utf-8") as f:
        json.dump(output_dict, f, sort_keys=True, indent=2)
        f.write("\n")


def parse_args():
    """Parse the command line.

    Returns:
        argparse.Namespace with ``tag``, ``source_manifest``,
        ``output_file`` and ``output_dir`` (all required).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--tag", required=True)
    parser.add_argument("--source_manifest", required=True)
    parser.add_argument("--output_file", required=True)
    parser.add_argument("--output_dir", required=True)

    return parser.parse_args()


def sync_files(opts):
    """Run ``sync_files.py ... get`` to fetch the database into ``output_dir``.

    Args:
        opts: namespace providing ``tag``, ``source_manifest`` and
            ``output_dir``.

    Returns:
        dict of the form ``{"add": [entry]}`` describing the database for
        the ``ncbi_fcs_gx_databases`` data table.

    Raises:
        subprocess.CalledProcessError: if the sync command exits non-zero.
            Its captured stderr is echoed to this process's stderr first so
            the failure reason is not lost.
    """
    os.makedirs(opts.output_dir, exist_ok=True)

    args = [
        "sync_files.py",
        "--mft",
        opts.source_manifest,
        "--dir",
        opts.output_dir,
        "get",
    ]

    try:
        subprocess.run(args, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # Output is captured, so without this the user would only see the
        # exit status and never the tool's own error message.
        if err.stderr:
            sys.stderr.write(err.stderr.decode(errors="replace"))
        raise

    return {
        "add": [
            {
                "value": opts.tag,
                "source_manifest": opts.source_manifest,
                "name": opts.output_dir,
            }
        ]
    }


def get_divisions(opts):
    """Collect the GX divisions listed in the downloaded taxa file.

    The taxa file is ``<output_dir>/<manifest name without suffix>.taxa.tsv``.
    Lines starting with ``#`` and blank lines are ignored; every other line
    must have at least five tab-separated columns, the fifth being the
    division in ``top:bottom`` form.

    Args:
        opts: namespace providing ``tag``, ``source_manifest`` and
            ``output_dir``.

    Returns:
        dict of the form ``{"add": [...]}`` with one entry per distinct
        division plus a fixed "Unknown / Unclassified" entry, sorted by
        display name.

    Raises:
        ValueError: if ``source_manifest`` does not end with ``.manifest``
            or a data line has fewer than five columns.
        OSError: if the taxa file cannot be opened.
    """
    # descriptions for the top-level gx divisions
    top_level_description = {
        "anml": "Animals (Metazoa)",
        "arch": "Archaea",
        "fung": "Fungi",
        "plnt": "Plants (Viridiplantae)",
        "prok": "Bacteria",
        "prst": "Protists (other Eukaryota)",
        "synt": "Synthetic",
        "virs": "Virus",
    }

    # get the pathname for the taxa file
    manifest_filename = os.path.basename(opts.source_manifest)
    # Explicit check instead of ``assert``: asserts vanish under ``-O``.
    if not manifest_filename.lower().endswith(MANIFEST_SUFFIX):
        raise ValueError(
            f'source_manifest does not end with "{MANIFEST_SUFFIX}": '
            f"{opts.source_manifest!r}"
        )
    manifest_tag = manifest_filename[: -len(MANIFEST_SUFFIX)]
    taxa_pathname = os.path.join(opts.output_dir, f"{manifest_tag}.taxa.tsv")

    gx_divisions = set()
    with open(taxa_pathname, encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            if line.startswith("#"):
                continue
            line = line.rstrip("\r\n")
            if not line:
                continue
            fields = line.split("\t", 4)
            if len(fields) < 5:
                raise ValueError(
                    f"{taxa_pathname}:{line_number}: expected 5 tab-separated "
                    f"columns, found {len(fields)}"
                )
            gx_divisions.add(fields[4])

    # The unknown division is always appended below; drop it here so it can
    # never be listed twice.
    gx_divisions.discard(UNKNOWN_DIVISION)

    elements = []
    for division in gx_divisions:
        top, sep, bottom = division.partition(":")
        # Fall back to the raw prefix so a division introduced by a newer
        # database release does not abort after the download completed.
        label = top_level_description.get(top, top)
        description = f"{label} - {bottom}" if sep else label
        elements.append((description, division))

    # add an element to support unknown/unclassified samples
    elements.append(("Unknown / Unclassified", UNKNOWN_DIVISION))

    return {
        "add": [
            {"value": gx_div, "tag": opts.tag, "name": name}
            for name, gx_div in sorted(elements)
        ]
    }


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,35 @@ +<tool id="data_manager_ncbi_fcs_gx_database_downloader" name="NCBI FCS GX database downloader" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" tool_type="manage_data" profile="@PROFILE@"> + <description>Download the NCBI FCS GX database</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="edam_ontology"/> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py' +--tag '$tag' +--source_manifest '$source_manifest' +--output_file '$output_file' +--output_dir '$output_file.extra_files_path' + ]]></command> + <inputs> + <param name="tag" type="text" label="Unique identifier for this database"/> + <param name="source_manifest" type="text" label="Source Manifest"/> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <test> + <param name="tag" value="test-only"/> + <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/> + <output name="output_file" file="test.json" compare="re_match"/> + </test> + </tests> + <help><![CDATA[ +This tool downloads NCBI FCS GX databases and makes them available to the NCBI FCS GX tool. + +See https://github.com/ncbi/fcs/wiki/FCS-GX#b-download-the-database + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/macros.xml Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,22 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement> + <yield/> + </requirements> + </xml> + <token name="@TOOL_VERSION@">0.5.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.05</token> + <xml name="edam_ontology"> + <edam_operations> + <edam_operation>operation_3187</edam_operation> + </edam_operations> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1101/2023.06.02.543519</citation> + <yield/> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,29 @@ +<data_managers> + <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader"> + <data_table name="ncbi_fcs_gx_databases"> + <output> + <column name="value" /> + <column name="source_manifest" /> + <column name="name" output_ref="output_file"> + <move type="directory"> + <source>${name}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases/${value}</target> + </move> + <value_translation><![CDATA[ +#import os +#set manifest_filename = os.path.basename($source_manifest) +$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases/$value/$manifest_filename + ]]></value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="ncbi_fcs_gx_divisions"> + <output> + <column name="value" /> + <column name="tag" /> + <column name="name" /> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_databases.loc Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,1 @@ +#tag source_manifest local_manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_divisions.tsv Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,1 @@ +#gx_div tag description
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.json Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,27 @@ +\{ + "data_tables": \{ + "ncbi_fcs_gx_databases": \{ + "add": \[ + \{ + "name": "[^"]+", + "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest", + "value": "test-only" + \} + \] + \}, + "ncbi_fcs_gx_divisions": \{ + "add": \[ + \{ + "name": "Bacteria - CFB group bacteria", + "tag": "test-only", + "value": "prok:CFB group bacteria" + \}, + \{ + "name": "Unknown / Unclassified", + "tag": "test-only", + "value": "unkn:unknown" + \} + \] + \} + \} +\}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_databases.loc.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,8 @@ +## NCBI FCS GX Databases +# +#tag manifest path +#r2022-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest +#r2022-07-08 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest +#r2023-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest +#latest https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest /big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest +#test-only https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest /big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_divisions.tsv.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,64 @@ +#gx_div tag description +#anml:amphibians latest Animals (Metazoa) - amphibians +#anml:basal metazoans latest Animals (Metazoa) - basal metazoans +#anml:birds latest Animals (Metazoa) - birds +#anml:brachiopods latest Animals (Metazoa) - brachiopods +#anml:crustaceans latest Animals (Metazoa) - crustaceans +#anml:echinoderms latest Animals (Metazoa) - echinoderms +#anml:fishes latest Animals (Metazoa) - fishes +#anml:insects latest Animals (Metazoa) - insects +#anml:mammals latest Animals (Metazoa) - mammals +#anml:marsupials latest Animals (Metazoa) - marsupials +#anml:molluscs latest Animals (Metazoa) - molluscs +#anml:nematodes latest Animals (Metazoa) - nematodes +#anml:primates latest Animals (Metazoa) - primates +#anml:reptiles latest Animals (Metazoa) - reptiles +#anml:rodents latest Animals (Metazoa) - rodents +#anml:rotifers latest Animals (Metazoa) - rotifers +#anml:tardigrades latest Animals (Metazoa) - tardigrades +#anml:worms latest Animals (Metazoa) - worms +#arch:archaea latest Archaea - archaea +#prok:CFB group bacteria latest Bacteria - CFB group bacteria +#prok:GNS bacteria latest Bacteria - GNS bacteria +#prok:a-proteobacteria latest Bacteria - a-proteobacteria +#prok:actinobacteria latest Bacteria - actinobacteria +#prok:aquificales latest Bacteria - aquificales +#prok:b-proteobacteria latest Bacteria - b-proteobacteria +#prok:bacteria latest Bacteria - bacteria +#prok:chlamydias latest Bacteria - chlamydias +#prok:cyanobacteria latest Bacteria - cyanobacteria +#prok:d-proteobacteria latest Bacteria - d-proteobacteria +#prok:firmicutes latest Bacteria - firmicutes +#prok:fusobacteria latest Bacteria - fusobacteria +#prok:g-proteobacteria latest Bacteria - g-proteobacteria +#prok:green sulfur bacteria latest Bacteria - green sulfur bacteria +#prok:high GC Gram+ latest Bacteria - high GC Gram+ 
+#prok:mycoplasmas latest Bacteria - mycoplasmas +#prok:planctomycetes latest Bacteria - planctomycetes +#prok:proteobacteria latest Bacteria - proteobacteria +#prok:spirochetes latest Bacteria - spirochetes +#prok:thermotogales latest Bacteria - thermotogales +#prok:verrucomicrobia latest Bacteria - verrucomicrobia +#fung:ascomycetes latest Fungi - ascomycetes +#fung:basidiomycetes latest Fungi - basidiomycetes +#fung:budding yeasts latest Fungi - budding yeasts +#fung:chytrids latest Fungi - chytrids +#fung:fungi latest Fungi - fungi +#fung:microsporidians latest Fungi - microsporidians +#plnt:green algae latest Plants (Viridiplantae) - green algae +#plnt:mosses latest Plants (Viridiplantae) - mosses +#plnt:plants latest Plants (Viridiplantae) - plants +#prst:algae latest Protists (other Eukaryota) - algae +#prst:alveolates latest Protists (other Eukaryota) - alveolates +#prst:cellular slime molds latest Protists (other Eukaryota) - cellular slime molds +#prst:cercozoans latest Protists (other Eukaryota) - cercozoans +#prst:choanoflagellates latest Protists (other Eukaryota) - choanoflagellates +#prst:euglenoids latest Protists (other Eukaryota) - euglenoids +#prst:monads latest Protists (other Eukaryota) - monads +#prst:protists latest Protists (other Eukaryota) - protists +#prst:slime nets latest Protists (other Eukaryota) - slime nets +#synt:synthetic latest Synthetic - synthetic +#unkn:unknown latest Unknown / Unclassified +#virs:eukaryotic viruses latest Virus - eukaryotic viruses +#virs:prokaryotic viruses latest Virus - prokaryotic viruses +#virs:viruses latest Virus - viruses
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Locations of NCBI FCS GX databases --> + <table name="ncbi_fcs_gx_databases" comment_char="#"> + <columns>value, source_manifest, name</columns> + <file path="tool-data/ncbi_fcs_gx_databases.loc" /> + </table> + <!-- NCBI FCS GX divisions --> + <table name="ncbi_fcs_gx_divisions" comment_char="#"> + <columns>value, tag, name</columns> + <file path="tool-data/ncbi_fcs_gx_divisions.tsv" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Locations of NCBI FCS GX databases --> + <table name="ncbi_fcs_gx_databases" comment_char="#"> + <columns>value, source_manifest, name</columns> + <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases.loc" /> + </table> + <!-- NCBI FCS GX divisions --> + <table name="ncbi_fcs_gx_divisions" comment_char="#"> + <columns>value, tag, name</columns> + <file path="${__HERE__}/test-data/ncbi_fcs_gx_divisions.tsv" /> + </table> +</tables>