#!/usr/bin/env python
# HG changeset patch
# User iuc
# Date 1705097477 0
# Node ID 6be6e6198ac3af1bc533b82a04e84e549fc8e63a
# planemo upload for repository
# https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader
# commit 4a6561ed00e004260be3f3c29d81e814c60e20af
#
# diff -r 000000000000 -r 6be6e6198ac3 data_manager/data_manager_ncbi_fcs_gx_database_downloader.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py Fri Jan 12 22:11:17 2024 +0000
# @@ -0,0 +1,111 @@
"""Download an NCBI FCS GX database and describe it as data table entries.

The script runs ``sync_files.py`` to fetch the files listed in a manifest,
reads the downloaded ``<manifest tag>.taxa.tsv`` file to find the GX
divisions it contains, and writes a JSON document with entries for the
``ncbi_fcs_gx_databases`` and ``ncbi_fcs_gx_divisions`` data tables.
"""

import argparse
import json
import os
import subprocess
import sys

# Suffix that the basename of --source_manifest must carry.
_MANIFEST_SUFFIX = ".manifest"

# Entry that is always offered so unknown/unclassified samples are supported.
_UNKNOWN_ELEMENT = ("Unknown / Unclassified", "unkn:unknown")

# Descriptions for the top-level gx divisions.
_TOP_LEVEL_DESCRIPTION = {
    "anml": "Animals (Metazoa)",
    "arch": "Archaea",
    "fung": "Fungi",
    "plnt": "Plants (Viridiplantae)",
    "prok": "Bacteria",
    "prst": "Protists (other Eukaryota)",
    "synt": "Synthetic",
    "virs": "Virus",
}


def main():
    """Download the database and write the data table JSON to --output_file."""
    opts = parse_args()

    # The download must finish first: get_divisions() reads the taxa file
    # that sync_files() places in the output directory.
    databases = sync_files(opts)
    divisions = get_divisions(opts)

    output_dict = {
        "data_tables": {
            "ncbi_fcs_gx_databases": databases,
            "ncbi_fcs_gx_divisions": divisions,
        }
    }

    with open(opts.output_file, "w") as f:
        json.dump(output_dict, f, sort_keys=True, indent=2)
        f.write("\n")


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``tag``, ``source_manifest``,
        ``output_file`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tag",
        required=True,
        help="identifier recorded with the database and division entries",
    )
    parser.add_argument(
        "--source_manifest",
        required=True,
        help='manifest handed to sync_files.py; must end with ".manifest"',
    )
    parser.add_argument(
        "--output_file",
        required=True,
        help="path of the JSON file to write",
    )
    parser.add_argument(
        "--output_dir",
        required=True,
        help="directory that receives the downloaded files",
    )

    return parser.parse_args(argv)


def sync_files(opts):
    """Fetch the files named by the manifest into ``opts.output_dir``.

    Creates the output directory if needed and runs
    ``sync_files.py --mft <source_manifest> --dir <output_dir> get``.
    The program is looked up on PATH (presumably provided by the
    ncbi-fcs-gx package -- confirm against the tool requirements).

    Args:
        opts: Namespace with ``tag``, ``source_manifest`` and ``output_dir``.

    Returns:
        The ``{"add": [...]}`` dictionary holding the single
        ``ncbi_fcs_gx_databases`` entry.

    Raises:
        subprocess.CalledProcessError: If ``sync_files.py`` exits non-zero.
    """
    os.makedirs(opts.output_dir, exist_ok=True)

    args = [
        "sync_files.py",
        "--mft",
        opts.source_manifest,
        "--dir",
        opts.output_dir,
        "get",
    ]

    try:
        subprocess.run(args, capture_output=True, check=True)
    except subprocess.CalledProcessError as err:
        # The child's output was captured, so it would otherwise be lost;
        # surface it before propagating the failure.
        for captured in (err.stdout, err.stderr):
            if captured:
                sys.stderr.write(captured.decode(errors="replace"))
        raise

    return {
        "add": [
            {
                "value": opts.tag,
                "source_manifest": opts.source_manifest,
                "name": opts.output_dir,
            }
        ]
    }


def get_divisions(opts):
    """Build the ``ncbi_fcs_gx_divisions`` entries from the taxa file.

    The taxa file is ``<output_dir>/<manifest tag>.taxa.tsv``, where the
    manifest tag is the basename of ``opts.source_manifest`` without its
    ".manifest" suffix. Lines starting with "#" and blank lines are skipped;
    the fifth tab-separated column of every other line is taken as the GX
    division in ``<top>:<bottom>`` form.

    Args:
        opts: Namespace with ``tag``, ``source_manifest`` and ``output_dir``.

    Returns:
        ``{"add": [...]}`` with one ``{"value", "tag", "name"}`` entry per
        distinct division plus the "Unknown / Unclassified" entry, ordered
        by name.

    Raises:
        ValueError: If the manifest name does not end with ".manifest" or a
            data line has fewer than five columns.
        OSError: If the taxa file cannot be opened.
    """
    # get the pathname for the taxa file
    manifest_filename = os.path.basename(opts.source_manifest)
    if not manifest_filename.lower().endswith(_MANIFEST_SUFFIX):
        # Raised explicitly (not asserted) so the check survives "python -O".
        raise ValueError('source_manifest does not end with ".manifest"')
    manifest_tag = manifest_filename[: -len(_MANIFEST_SUFFIX)]
    taxa_pathname = os.path.join(opts.output_dir, f"{manifest_tag}.taxa.tsv")

    gx_divisions = set()
    with open(taxa_pathname) as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip("\r\n")
            if not line or line.startswith("#"):
                continue
            # maxsplit=4 keeps any further tabs inside the division column.
            fields = line.split("\t", 4)
            if len(fields) < 5:
                raise ValueError(
                    f"{taxa_pathname}:{line_number}: expected 5 tab-separated "
                    f"columns, found {len(fields)}"
                )
            gx_divisions.add(fields[4])

    # A set also guards against listing the unknown entry twice should the
    # taxa file itself contain it.
    elements = {_UNKNOWN_ELEMENT}
    for division in gx_divisions:
        if division == _UNKNOWN_ELEMENT[1]:
            continue
        top, separator, bottom = division.partition(":")
        # Unrecognised top-level codes are shown as-is rather than aborting
        # after the download has already completed.
        top_description = _TOP_LEVEL_DESCRIPTION.get(top, top)
        if separator:
            description = f"{top_description} - {bottom}"
        else:
            description = top_description
        elements.add((description, division))

    return {
        "add": [
            {"value": gx_div, "tag": opts.tag, "name": name}
            for name, gx_div in sorted(elements)
        ]
    }


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Remainder of the changeset text that shared these source lines: other files
# added by the same patch, whose XML markup was already missing. Carried over
# as comments so nothing is lost; the "Downoad" typo in the tool description
# is corrected to "Download".
#
# diff -r 000000000000 -r 6be6e6198ac3 data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Fri Jan 12 22:11:17 2024 +0000
# @@ -0,0 +1,35 @@
# + Download the NCBI FCS GX database
# + macros.xml
#
# diff -r 000000000000 -r 6be6e6198ac3 data_manager/macros.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager/macros.xml Fri Jan 12 22:11:17 2024 +0000
# @@ -0,0 +1,22 @@
# + ncbi-fcs-gx
# + 0.5.0
# + 0
# + 21.05
# + operation_3187
# + 10.1101/2023.06.02.543519
#
# diff -r 000000000000 -r 6be6e6198ac3 data_manager_conf.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/data_manager_conf.xml Fri Jan 12 22:11:17 2024 +0000
# @@ -0,0 +1,29 @@
# + ${name}
# + ncbi_fcs_gx_databases/${value}
# (the data_manager_conf.xml text continues on the following line)
+ abspath + + + + + + + + + + + + diff -r 000000000000 -r 6be6e6198ac3 test-data/ncbi_fcs_gx_databases.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_databases.loc Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,1 @@ +#tag source_manifest local_manifest diff -r 000000000000 -r 6be6e6198ac3 test-data/ncbi_fcs_gx_divisions.tsv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_divisions.tsv Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,1 @@ +#gx_div tag description diff -r 000000000000 -r 6be6e6198ac3 test-data/test.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.json Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,27 @@ +\{ + "data_tables": \{ + "ncbi_fcs_gx_databases": \{ + "add": \[ + \{ + "name": "[^"]+", + "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest", + "value": "test-only" + \} + \] + \}, + "ncbi_fcs_gx_divisions": \{ + "add": \[ + \{ + "name": "Bacteria - CFB group bacteria", + "tag": "test-only", + "value": "prok:CFB group bacteria" + \}, + \{ + "name": "Unknown / Unclassified", + "tag": "test-only", + "value": "unkn:unknown" + \} + \] + \} + \} +\} diff -r 000000000000 -r 6be6e6198ac3 tool-data/ncbi_fcs_gx_databases.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_databases.loc.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,8 @@ +## NCBI FCS GX Databases +# +#tag manifest path +#r2022-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest +#r2022-07-08 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest +#r2023-01-24 https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest /big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest +#latest 
https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest /big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest +#test-only https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest /big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest diff -r 000000000000 -r 6be6e6198ac3 tool-data/ncbi_fcs_gx_divisions.tsv.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_divisions.tsv.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,64 @@ +#gx_div tag description +#anml:amphibians latest Animals (Metazoa) - amphibians +#anml:basal metazoans latest Animals (Metazoa) - basal metazoans +#anml:birds latest Animals (Metazoa) - birds +#anml:brachiopods latest Animals (Metazoa) - brachiopods +#anml:crustaceans latest Animals (Metazoa) - crustaceans +#anml:echinoderms latest Animals (Metazoa) - echinoderms +#anml:fishes latest Animals (Metazoa) - fishes +#anml:insects latest Animals (Metazoa) - insects +#anml:mammals latest Animals (Metazoa) - mammals +#anml:marsupials latest Animals (Metazoa) - marsupials +#anml:molluscs latest Animals (Metazoa) - molluscs +#anml:nematodes latest Animals (Metazoa) - nematodes +#anml:primates latest Animals (Metazoa) - primates +#anml:reptiles latest Animals (Metazoa) - reptiles +#anml:rodents latest Animals (Metazoa) - rodents +#anml:rotifers latest Animals (Metazoa) - rotifers +#anml:tardigrades latest Animals (Metazoa) - tardigrades +#anml:worms latest Animals (Metazoa) - worms +#arch:archaea latest Archaea - archaea +#prok:CFB group bacteria latest Bacteria - CFB group bacteria +#prok:GNS bacteria latest Bacteria - GNS bacteria +#prok:a-proteobacteria latest Bacteria - a-proteobacteria +#prok:actinobacteria latest Bacteria - actinobacteria +#prok:aquificales latest Bacteria - aquificales +#prok:b-proteobacteria latest Bacteria - b-proteobacteria +#prok:bacteria latest Bacteria - bacteria +#prok:chlamydias latest Bacteria - chlamydias +#prok:cyanobacteria latest 
Bacteria - cyanobacteria +#prok:d-proteobacteria latest Bacteria - d-proteobacteria +#prok:firmicutes latest Bacteria - firmicutes +#prok:fusobacteria latest Bacteria - fusobacteria +#prok:g-proteobacteria latest Bacteria - g-proteobacteria +#prok:green sulfur bacteria latest Bacteria - green sulfur bacteria +#prok:high GC Gram+ latest Bacteria - high GC Gram+ +#prok:mycoplasmas latest Bacteria - mycoplasmas +#prok:planctomycetes latest Bacteria - planctomycetes +#prok:proteobacteria latest Bacteria - proteobacteria +#prok:spirochetes latest Bacteria - spirochetes +#prok:thermotogales latest Bacteria - thermotogales +#prok:verrucomicrobia latest Bacteria - verrucomicrobia +#fung:ascomycetes latest Fungi - ascomycetes +#fung:basidiomycetes latest Fungi - basidiomycetes +#fung:budding yeasts latest Fungi - budding yeasts +#fung:chytrids latest Fungi - chytrids +#fung:fungi latest Fungi - fungi +#fung:microsporidians latest Fungi - microsporidians +#plnt:green algae latest Plants (Viridiplantae) - green algae +#plnt:mosses latest Plants (Viridiplantae) - mosses +#plnt:plants latest Plants (Viridiplantae) - plants +#prst:algae latest Protists (other Eukaryota) - algae +#prst:alveolates latest Protists (other Eukaryota) - alveolates +#prst:cellular slime molds latest Protists (other Eukaryota) - cellular slime molds +#prst:cercozoans latest Protists (other Eukaryota) - cercozoans +#prst:choanoflagellates latest Protists (other Eukaryota) - choanoflagellates +#prst:euglenoids latest Protists (other Eukaryota) - euglenoids +#prst:monads latest Protists (other Eukaryota) - monads +#prst:protists latest Protists (other Eukaryota) - protists +#prst:slime nets latest Protists (other Eukaryota) - slime nets +#synt:synthetic latest Synthetic - synthetic +#unkn:unknown latest Unknown / Unclassified +#virs:eukaryotic viruses latest Virus - eukaryotic viruses +#virs:prokaryotic viruses latest Virus - prokaryotic viruses +#virs:viruses latest Virus - viruses diff -r 000000000000 -r 
6be6e6198ac3 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,12 @@ + + + + value, source_manifest, name + +
+ + + value, tag, name + +
+
diff -r 000000000000 -r 6be6e6198ac3 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Fri Jan 12 22:11:17 2024 +0000 @@ -0,0 +1,12 @@ + + + + value, source_manifest, name + +
+ + + value, tag, name + +
+