changeset 0:6be6e6198ac3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 4a6561ed00e004260be3f3c29d81e814c60e20af
author iuc
date Fri, 12 Jan 2024 22:11:17 +0000
parents
children 35cef758050c
files data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases.loc.sample tool-data/ncbi_fcs_gx_divisions.tsv.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 322 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import os
+import subprocess
+
+
+def main():
+    """Download the database and write the data manager JSON to --output_file."""
+    cli = parse_args()
+
+    # Keep this order: get_divisions() opens a taxa file inside the directory
+    # that sync_files() creates and hands to sync_files.py.
+    databases = sync_files(cli)
+    divisions = get_divisions(cli)
+    tables = {"ncbi_fcs_gx_databases": databases, "ncbi_fcs_gx_divisions": divisions}
+    payload = json.dumps({"data_tables": tables}, sort_keys=True, indent=2)
+    with open(cli.output_file, "w") as handle:
+        handle.write(payload + "\n")
+
+
+def parse_args():
+    """Parse the command line; all four options are mandatory strings."""
+    cli = argparse.ArgumentParser()
+    for flag in ("--tag", "--source_manifest", "--output_file", "--output_dir"):
+        cli.add_argument(flag, required=True)
+
+    # With no explicit list, argparse reads the arguments from sys.argv[1:].
+    return cli.parse_args()
+
+
+def sync_files(opts):
+    """Fetch the database with sync_files.py and describe it for Galaxy.
+
+    Returns the {"add": [...]} entry for the ncbi_fcs_gx_databases table.
+    Raises subprocess.CalledProcessError if sync_files.py exits non-zero.
+    """
+    import sys  # local import: only needed to report a failed download
+
+    os.makedirs(opts.output_dir, exist_ok=True)
+    args = ["sync_files.py", "--mft", opts.source_manifest]
+    args += ["--dir", opts.output_dir, "get"]
+
+    try:
+        subprocess.run(args, capture_output=True, check=True)
+    except subprocess.CalledProcessError as err:
+        # The output was captured, so relay it: otherwise the reason for
+        # the failure is lost and only the exit status is reported.
+        for stream in (err.stdout, err.stderr):
+            if stream:
+                sys.stderr.write(stream.decode(errors="replace"))
+        raise
+
+    entry = {
+        "value": opts.tag,
+        "source_manifest": opts.source_manifest,
+        "name": opts.output_dir,
+    }
+    return {"add": [entry]}
+
+
+def get_divisions(opts):
+    """Build the ncbi_fcs_gx_divisions entries from the downloaded taxa file.
+
+    Collects the distinct fifth-column values ("top:sub" division labels) of
+    "<manifest name>.taxa.tsv" in opts.output_dir and returns them, plus an
+    unknown/unclassified entry, as {"add": [...]} sorted by display name.
+    Raises ValueError if source_manifest does not end in ".manifest" or a
+    data line has fewer than five tab-separated columns.
+    """
+    # descriptions for the top-level gx divisions
+    top_level_description = {
+        "anml": "Animals (Metazoa)",
+        "arch": "Archaea",
+        "fung": "Fungi",
+        "plnt": "Plants (Viridiplantae)",
+        "prok": "Bacteria",
+        "prst": "Protists (other Eukaryota)",
+        "synt": "Synthetic",
+        "virs": "Virus",
+    }
+    unknown = ("Unknown / Unclassified", "unkn:unknown")
+
+    # get the pathname for the taxa file
+    manifest_filename = os.path.basename(opts.source_manifest)
+    if not manifest_filename.lower().endswith(".manifest"):
+        raise ValueError('source_manifest does not end with ".manifest"')
+    manifest_tag = manifest_filename[: -len(".manifest")]
+    taxa_pathname = os.path.join(opts.output_dir, f"{manifest_tag}.taxa.tsv")
+
+    gx_divisions = set()
+    with open(taxa_pathname, encoding="utf-8", errors="replace") as f:
+        for line in f:
+            if line.startswith("#") or not line.strip():
+                continue  # comment/header or blank line
+            _, _, _, _, div = line.rstrip("\r\n").split("\t", 4)
+            gx_divisions.add(div)
+    gx_divisions.discard(unknown[1])  # appended below; avoid a duplicate row
+
+    elements = [unknown]
+    for division in gx_divisions:
+        top, _, bottom = division.partition(":")
+        label = top_level_description.get(top, top)  # unlisted prefix: show as-is
+        elements.append((f"{label} - {bottom}" if bottom else label, division))
+    rows = [{"value": d, "tag": opts.tag, "name": n} for n, d in sorted(elements)]
+    return {"add": rows}
+
+
+if __name__ == "__main__":
+    main()  # script entry point; nothing runs on import
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,35 @@
+<tool id="data_manager_ncbi_fcs_gx_database_downloader" name="NCBI FCS GX database downloader" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" tool_type="manage_data" profile="@PROFILE@">
+    <description>Download the NCBI FCS GX database</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py'
+--tag '$tag'
+--source_manifest '$source_manifest'
+--output_file '$output_file'
+--output_dir '$output_file.extra_files_path'
+    ]]></command>
+    <inputs>
+        <param name="tag" type="text" label="Unique identifier for this database"/>
+        <param name="source_manifest" type="text" label="Source Manifest"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="tag" value="test-only"/>
+            <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/>
+            <output name="output_file" file="test.json" compare="re_match"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+This tool downloads NCBI FCS GX databases and makes them available to the NCBI FCS GX tool.
+
+See https://github.com/ncbi/fcs/wiki/FCS-GX#b-download-the-database
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/macros.xml	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,22 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.5.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="edam_ontology">
+        <edam_operations>
+            <edam_operation>operation_3187</edam_operation>
+        </edam_operations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/2023.06.02.543519</citation>
+            <yield/>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,29 @@
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader">
+        <data_table name="ncbi_fcs_gx_databases">
+            <output>
+                <column name="value" />
+                <column name="source_manifest" />
+                <column name="name" output_ref="output_file">
+                    <move type="directory">
+                        <source>${name}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases/${value}</target>
+                    </move>
+                    <value_translation><![CDATA[
+#import os
+#set manifest_filename = os.path.basename($source_manifest)
+$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases/$value/$manifest_filename
+                    ]]></value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="ncbi_fcs_gx_divisions">
+            <output>
+                <column name="value" />
+                <column name="tag" />
+                <column name="name" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_fcs_gx_databases.loc	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,1 @@
+#tag	source_manifest	local_manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_fcs_gx_divisions.tsv	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,1 @@
+#gx_div	tag	description
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.json	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,27 @@
+\{
+  "data_tables": \{
+    "ncbi_fcs_gx_databases": \{
+      "add": \[
+        \{
+          "name": "[^"]+",
+          "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest",
+          "value": "test-only"
+        \}
+      \]
+    \},
+    "ncbi_fcs_gx_divisions": \{
+      "add": \[
+        \{
+          "name": "Bacteria - CFB group bacteria",
+          "tag": "test-only",
+          "value": "prok:CFB group bacteria"
+        \},
+        \{
+          "name": "Unknown / Unclassified",
+          "tag": "test-only",
+          "value": "unkn:unknown"
+        \}
+      \]
+    \}
+  \}
+\}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_fcs_gx_databases.loc.sample	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,8 @@
+## NCBI FCS GX Databases
+# 
+#tag	manifest	path
+#r2022-01-24	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-01-24/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2022-01-24/all.manifest
+#r2022-07-08	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2022-07-08/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2022-07-08/all.manifest
+#r2023-01-24	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/r2023-01-24/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/r2023-01-24/all.manifest
+#latest	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest	/big/data/dir/ncbi_fcs_gx_databases/latest/all.manifest
+#test-only	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest	/big/data/dir/ncbi_fcs_gx_databases/test-only/test-only.manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_fcs_gx_divisions.tsv.sample	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,64 @@
+#gx_div	tag	description
+#anml:amphibians	latest	Animals (Metazoa) - amphibians
+#anml:basal metazoans	latest	Animals (Metazoa) - basal metazoans
+#anml:birds	latest	Animals (Metazoa) - birds
+#anml:brachiopods	latest	Animals (Metazoa) - brachiopods
+#anml:crustaceans	latest	Animals (Metazoa) - crustaceans
+#anml:echinoderms	latest	Animals (Metazoa) - echinoderms
+#anml:fishes	latest	Animals (Metazoa) - fishes
+#anml:insects	latest	Animals (Metazoa) - insects
+#anml:mammals	latest	Animals (Metazoa) - mammals
+#anml:marsupials	latest	Animals (Metazoa) - marsupials
+#anml:molluscs	latest	Animals (Metazoa) - molluscs
+#anml:nematodes	latest	Animals (Metazoa) - nematodes
+#anml:primates	latest	Animals (Metazoa) - primates
+#anml:reptiles	latest	Animals (Metazoa) - reptiles
+#anml:rodents	latest	Animals (Metazoa) - rodents
+#anml:rotifers	latest	Animals (Metazoa) - rotifers
+#anml:tardigrades	latest	Animals (Metazoa) - tardigrades
+#anml:worms	latest	Animals (Metazoa) - worms
+#arch:archaea	latest	Archaea - archaea
+#prok:CFB group bacteria	latest	Bacteria - CFB group bacteria
+#prok:GNS bacteria	latest	Bacteria - GNS bacteria
+#prok:a-proteobacteria	latest	Bacteria - a-proteobacteria
+#prok:actinobacteria	latest	Bacteria - actinobacteria
+#prok:aquificales	latest	Bacteria - aquificales
+#prok:b-proteobacteria	latest	Bacteria - b-proteobacteria
+#prok:bacteria	latest	Bacteria - bacteria
+#prok:chlamydias	latest	Bacteria - chlamydias
+#prok:cyanobacteria	latest	Bacteria - cyanobacteria
+#prok:d-proteobacteria	latest	Bacteria - d-proteobacteria
+#prok:firmicutes	latest	Bacteria - firmicutes
+#prok:fusobacteria	latest	Bacteria - fusobacteria
+#prok:g-proteobacteria	latest	Bacteria - g-proteobacteria
+#prok:green sulfur bacteria	latest	Bacteria - green sulfur bacteria
+#prok:high GC Gram+	latest	Bacteria - high GC Gram+
+#prok:mycoplasmas	latest	Bacteria - mycoplasmas
+#prok:planctomycetes	latest	Bacteria - planctomycetes
+#prok:proteobacteria	latest	Bacteria - proteobacteria
+#prok:spirochetes	latest	Bacteria - spirochetes
+#prok:thermotogales	latest	Bacteria - thermotogales
+#prok:verrucomicrobia	latest	Bacteria - verrucomicrobia
+#fung:ascomycetes	latest	Fungi - ascomycetes
+#fung:basidiomycetes	latest	Fungi - basidiomycetes
+#fung:budding yeasts	latest	Fungi - budding yeasts
+#fung:chytrids	latest	Fungi - chytrids
+#fung:fungi	latest	Fungi - fungi
+#fung:microsporidians	latest	Fungi - microsporidians
+#plnt:green algae	latest	Plants (Viridiplantae) - green algae
+#plnt:mosses	latest	Plants (Viridiplantae) - mosses
+#plnt:plants	latest	Plants (Viridiplantae) - plants
+#prst:algae	latest	Protists (other Eukaryota) - algae
+#prst:alveolates	latest	Protists (other Eukaryota) - alveolates
+#prst:cellular slime molds	latest	Protists (other Eukaryota) - cellular slime molds
+#prst:cercozoans	latest	Protists (other Eukaryota) - cercozoans
+#prst:choanoflagellates	latest	Protists (other Eukaryota) - choanoflagellates
+#prst:euglenoids	latest	Protists (other Eukaryota) - euglenoids
+#prst:monads	latest	Protists (other Eukaryota) - monads
+#prst:protists	latest	Protists (other Eukaryota) - protists
+#prst:slime nets	latest	Protists (other Eukaryota) - slime nets
+#synt:synthetic	latest	Synthetic - synthetic
+#unkn:unknown	latest	Unknown / Unclassified
+#virs:eukaryotic viruses	latest	Virus - eukaryotic viruses
+#virs:prokaryotic viruses	latest	Virus - prokaryotic viruses
+#virs:viruses	latest	Virus - viruses
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Locations of NCBI FCS GX databases -->
+    <table name="ncbi_fcs_gx_databases" comment_char="#">
+        <columns>value, source_manifest, name</columns>
+        <file path="tool-data/ncbi_fcs_gx_databases.loc" />
+    </table>
+    <!-- NCBI FCS GX divisions -->
+    <table name="ncbi_fcs_gx_divisions" comment_char="#">
+        <columns>value, tag, name</columns>
+        <file path="tool-data/ncbi_fcs_gx_divisions.tsv" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Fri Jan 12 22:11:17 2024 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Locations of NCBI FCS GX databases -->
+    <table name="ncbi_fcs_gx_databases" comment_char="#">
+        <columns>value, source_manifest, name</columns>
+        <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases.loc" />
+    </table>
+    <!-- NCBI FCS GX divisions -->
+    <table name="ncbi_fcs_gx_divisions" comment_char="#">
+        <columns>value, tag, name</columns>
+        <file path="${__HERE__}/test-data/ncbi_fcs_gx_divisions.tsv" />
+    </table>
+</tables>