Mercurial > repos > iuc > data_manager_build_kraken2_database
changeset 16:54871a78828e draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_build_kraken2_database commit a108f20aebc04574a8bd0a90b955064439a50852
| author | iuc |
|---|---|
| date | Wed, 05 Nov 2025 13:32:18 +0000 |
| parents | 201eff2131d6 |
| children | |
| files | data_manager/kraken2_build_database.py data_manager/kraken2_build_database.xml |
| diffstat | 2 files changed, 352 insertions(+), 566 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/kraken2_build_database.py Sat Jan 25 17:41:48 2025 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,506 +0,0 @@ -#!/usr/bin/env python - -import argparse -import datetime -import errno -import json -import os -import re -import shutil -import subprocess -import sys -import tarfile -from enum import Enum - -try: - # Python3 - from urllib.request import urlopen - from urllib.error import URLError -except ImportError: - from urllib2 import urlopen - from urllib2 import URLError - - -DATA_TABLE_NAME = "kraken2_databases" - - -class KrakenDatabaseTypes(Enum): - standard_local_build = 'standard_local_build' - standard_prebuilt = 'standard_prebuilt' - minikraken = 'minikraken' - special_prebuilt = 'special_prebuilt' - special = 'special' - custom = 'custom' - - def __str__(self): - return self.value - - -class SpecialDatabaseTypes(Enum): - rdp = 'rdp' - greengenes = 'greengenes' - silva = 'silva' - - def __str__(self): - return self.value - - -class Minikraken2Versions(Enum): - v1 = 'v1' - v2 = 'v2' - - def __str__(self): - return self.value - - -class StandardPrebuiltSizes(Enum): - viral = "viral" - minusb = "minusb" - standard = "standard" - standard_08gb = "standard_08gb" - standard_16gb = "standard_16gb" - pluspf = "pluspf" - pluspf_08gb = "pluspf_08gb" - pluspf_16gb = "pluspf_16gb" - pluspfp = "pluspfp" - pluspfp_08gb = "pluspfp_08gb" - pluspfp_16gb = "pluspfp_16gb" - eupathdb48 = "eupathdb48" - core_nt = "core_nt" - gtdb_genome_reps = "gtdb_genome_reps" - - def __str__(self): - return self.value - - -def kraken2_build_standard(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME): - now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") - - database_value = "_".join([ - now, - "standard", - "kmer-len", str(kraken2_args["kmer_len"]), - "minimizer-len", str(kraken2_args["minimizer_len"]), - "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), - "load-factor", str(kraken2_args["load_factor"]), - ]) - - database_name = " ".join([ - "Standard (Local Build)", - "(Created:", - now + ",", - "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", - "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", - "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", - "load-factor", str(kraken2_args["load_factor"]), - ]) - - database_path = database_value - - args = [ - '--threads', str(kraken2_args["threads"]), - '--standard', - '--kmer-len', str(kraken2_args["kmer_len"]), - '--minimizer-len', str(kraken2_args["minimizer_len"]), - '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]), - '--load-factor', str(kraken2_args["load_factor"]), - '--db', database_path - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - if kraken2_args["clean"]: - args = [ - '--threads', str(kraken2_args["threads"]), - '--clean', - '--db', database_path - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - data_table_entry = { - 'data_tables': { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def kraken2_build_standard_prebuilt(prebuilt_db, prebuilt_date, target_directory, data_table_name=DATA_TABLE_NAME): - - now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") - - prebuild_name = { - 'viral': "Viral", - 'minusb': "MinusB (archaea, viral, plasmid, human, UniVec_Core)", - 'standard': "Standard-Full (archaea, bacteria, viral, plasmid, human,UniVec_Core)", - 'standard_08gb': "Standard-8 (Standard with DB capped at 8 GB)", - 'standard_16gb': "Standard-16 (Standard with DB capped at 16 GB)", - 'pluspf': "PlusPF (Standard plus protozoa and fungi)", - 'pluspf_08gb': "PlusPF-8 (PlusPF with DB capped at 8 GB)", - 'pluspf_16gb': "PlusPF-16 (PlusPF with DB capped at 16 GB)", - 'pluspfp': "PlusPFP (Standard plus protozoa, fungi and plant)", - 'pluspfp_08gb': "PlusPFP-8 (PlusPFP with DB capped at 8 GB)", - 'pluspfp_16gb': "PlusPFP-16 (PlusPFP with DB capped at 16 GB)", - 'eupathdb48': "EuPathDB-46", - 'core_nt': "core_nt (Very large collection, inclusive of GenBank, RefSeq, TPA and PDB)", - 'gtdb_genome_reps': "GTDB v220 (Bacterial and archaeal)", - } - - database_value = "_".join([ - now, - "standard_prebuilt", - prebuilt_db, - prebuilt_date - ]) - - database_name = " ".join([ - "Prebuilt Refseq indexes: ", - prebuild_name[prebuilt_db], - "(Version: ", - prebuilt_date, - "- Downloaded:", - now + ")" - ]) - - database_path = database_value - - # we may need to let the user choose the date when new DBs are posted. - date_url_str = prebuilt_date.replace('-', '') - # download the pre-built database - try: - download_url = 'https://genome-idx.s3.amazonaws.com/kraken/k2_%s_%s.tar.gz' % (prebuilt_db, date_url_str) - src = urlopen(download_url) - except URLError as e: - print('url: ' + download_url, file=sys.stderr) - print(e, file=sys.stderr) - exit(1) - - with open('tmp_data.tar.gz', 'wb') as dst: - shutil.copyfileobj(src, dst) - # unpack the downloaded archive to the target directory - with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: - for member in fh.getmembers(): - if member.isreg(): - member.name = os.path.basename(member.name) - fh.extract(member, os.path.join(target_directory, database_path)) - - data_table_entry = { - 'data_tables': { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def kraken2_build_minikraken(minikraken2_version, target_directory, data_table_name=DATA_TABLE_NAME): - - now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") - - database_value = "_".join([ - now, - "minikraken2", - minikraken2_version, - "8GB", - ]) - - database_name = " ".join([ - "Minikraken2", - minikraken2_version, - "(Created:", - now + ")" - ]) - - database_path = database_value - - # download the minikraken2 data - try: - download_url = 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_%s_8GB_201904.tgz' % minikraken2_version - src = urlopen(download_url) - except URLError as e: - print('url: ' + download_url, file=sys.stderr) - print(e, file=sys.stderr) - exit(1) - - with open('tmp_data.tar.gz', 'wb') as dst: - shutil.copyfileobj(src, dst) - # unpack the downloaded archive to the target directory - with tarfile.open('tmp_data.tar.gz', 'r:gz') as fh: - for member in fh.getmembers(): - if member.isreg(): - member.name = os.path.basename(member.name) - fh.extract(member, os.path.join(target_directory, database_path)) - - data_table_entry = { - 'data_tables': { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def kraken2_build_special(kraken2_args, target_directory, data_table_name=DATA_TABLE_NAME): - - now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") - - special_database_names = { - "rdp": "RDP", - "greengenes": "Greengenes", - "silva": "Silva", - } - - database_value = "_".join([ - now, - kraken2_args["special_database_type"], - "kmer-len", str(kraken2_args["kmer_len"]), - "minimizer-len", str(kraken2_args["minimizer_len"]), - "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), - "load-factor", str(kraken2_args["load_factor"]), - ]) - - database_name = " ".join([ - special_database_names[kraken2_args["special_database_type"]], - "(Created:", - now + ",", - "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", - "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", - "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ")", - "load-factor=" + str(kraken2_args["load_factor"]) + ")", - ]) - - database_path = database_value - - args = [ - '--threads', str(kraken2_args["threads"]), - '--special', kraken2_args["special_database_type"], - '--kmer-len', str(kraken2_args["kmer_len"]), - '--minimizer-len', str(kraken2_args["minimizer_len"]), - '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]), - '--load-factor', str(kraken2_args["load_factor"]), - '--db', database_path - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - if kraken2_args["clean"]: - args = [ - '--threads', str(kraken2_args["threads"]), - '--clean', - '--db', database_path - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - data_table_entry = { - 'data_tables': { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def kraken2_build_custom(kraken2_args, custom_database_name, custom_source_info, target_directory, data_table_name=DATA_TABLE_NAME): - now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") - - database_value = "_".join([ - now, - re.sub(r'[^\w_.-]+', '_', custom_database_name).strip('_'), - "kmer-len", str(kraken2_args["kmer_len"]), - "minimizer-len", str(kraken2_args["minimizer_len"]), - "minimizer-spaces", str(kraken2_args["minimizer_spaces"]), - "load-factor", str(kraken2_args["load_factor"]), - ]) - - database_name = " ".join([ - custom_database_name, - "(" + custom_source_info + ",", - "kmer-len=" + str(kraken2_args["kmer_len"]) + ",", - "minimizer-len=" + str(kraken2_args["minimizer_len"]) + ",", - "minimizer-spaces=" + str(kraken2_args["minimizer_spaces"]) + ",", - "load-factor=" + str(kraken2_args["load_factor"]) + ")", - ]) - - database_path = database_value - - args = [ - '--threads', str(kraken2_args["threads"]), - '--download-taxonomy', - '--db', database_path, - ] - - if kraken2_args['skip_maps']: - args.append('--skip-maps') - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - args = [ - '--threads', str(kraken2_args["threads"]), - '--add-to-library', kraken2_args["custom_fasta"], - '--db', database_path, - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - args = [ - '--threads', str(kraken2_args["threads"]), - '--build', - '--kmer-len', str(kraken2_args["kmer_len"]), - '--minimizer-len', str(kraken2_args["minimizer_len"]), - '--minimizer-spaces', str(kraken2_args["minimizer_spaces"]), - '--load-factor', str(kraken2_args["load_factor"]), - '--db', database_path, - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - if kraken2_args["clean"]: - args = [ - '--threads', str(kraken2_args["threads"]), - '--clean', - '--db', database_path, - ] - - subprocess.check_call(['kraken2-build'] + args, cwd=target_directory) - - data_table_entry = { - 'data_tables': { - data_table_name: [ - { - "value": database_value, - "name": database_name, - "path": database_path, - } - ] - } - } - - return data_table_entry - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('data_manager_json') - parser.add_argument('--kmer-len', dest='kmer_len', type=int, default=35, help='kmer length') - parser.add_argument('--minimizer-len', dest='minimizer_len', type=int, default=31, help='minimizer length') - parser.add_argument('--minimizer-spaces', dest='minimizer_spaces', default=6, help='minimizer spaces') - parser.add_argument('--load-factor', dest='load_factor', type=float, default=0.7, help='load factor') - parser.add_argument('--threads', dest='threads', default=1, help='threads') - parser.add_argument('--database-type', dest='database_type', type=KrakenDatabaseTypes, choices=list(KrakenDatabaseTypes), required=True, help='type of kraken database to build') - parser.add_argument('--minikraken2-version', dest='minikraken2_version', type=Minikraken2Versions, choices=list(Minikraken2Versions), help='MiniKraken2 version (only applies to --database-type minikraken)') - parser.add_argument('--prebuilt-db', dest='prebuilt_db', type=StandardPrebuiltSizes, choices=list(StandardPrebuiltSizes), help='Prebuilt database to download. Only applies to --database-type standard_prebuilt or special_prebuilt.') - parser.add_argument('--prebuilt-date', dest='prebuilt_date', help='Database build date (YYYY-MM-DD). Only applies to --database-type standard_prebuilt.') - parser.add_argument('--special-database-type', dest='special_database_type', type=SpecialDatabaseTypes, choices=list(SpecialDatabaseTypes), help='type of special database to build (only applies to --database-type special)') - parser.add_argument('--custom-fasta', dest='custom_fasta', help='fasta file for custom database (only applies to --database-type custom)') - parser.add_argument('--custom-database-name', dest='custom_database_name', help='Name for custom database (only applies to --database-type custom)') - parser.add_argument('--custom-source-info', dest='custom_source_info', help='Description of how this build has been sourced (only applies to --database-type custom)') - parser.add_argument('--skip-maps', dest='skip_maps', action='store_true', help='') - parser.add_argument('--clean', dest='clean', action='store_true', help='Clean up extra files') - args = parser.parse_args() - - with open(args.data_manager_json) as fh: - data_manager_input = json.load(fh) - - target_directory = data_manager_input['output_data'][0]['extra_files_path'] - - try: - os.mkdir(target_directory) - except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(target_directory): - pass - else: - raise - - data_manager_output = {} - - if str(args.database_type) == 'standard_local_build': - kraken2_args = { - "kmer_len": args.kmer_len, - "minimizer_len": args.minimizer_len, - "minimizer_spaces": args.minimizer_spaces, - "load_factor": args.load_factor, - "threads": args.threads, - "clean": args.clean, - } - data_manager_output = kraken2_build_standard( - kraken2_args, - target_directory, - ) - elif str(args.database_type) in ('standard_prebuilt', 'special_prebuilt'): - data_manager_output = kraken2_build_standard_prebuilt( - str(args.prebuilt_db), - str(args.prebuilt_date), - target_directory - ) - elif str(args.database_type) == 'minikraken': - data_manager_output = kraken2_build_minikraken( - str(args.minikraken2_version), - target_directory - ) - elif str(args.database_type) == 'special': - kraken2_args = { - "special_database_type": str(args.special_database_type), - "kmer_len": args.kmer_len, - "minimizer_len": args.minimizer_len, - "minimizer_spaces": args.minimizer_spaces, - "load_factor": args.load_factor, - "threads": args.threads, - "clean": args.clean, - } - data_manager_output = kraken2_build_special( - kraken2_args, - target_directory, - ) - elif str(args.database_type) == 'custom': - kraken2_args = { - "custom_fasta": args.custom_fasta, - "skip_maps": args.skip_maps, - "kmer_len": args.kmer_len, - "minimizer_len": args.minimizer_len, - "minimizer_spaces": args.minimizer_spaces, - "load_factor": args.load_factor, - "threads": args.threads, - "clean": args.clean, - } - data_manager_output = kraken2_build_custom( - kraken2_args, - args.custom_database_name, - args.custom_source_info, - target_directory, - ) - else: - sys.exit("Invalid database type") - - with open(args.data_manager_json, 'w') as fh: - json.dump(data_manager_output, fh, sort_keys=True) - - -if __name__ == "__main__": - main()
--- a/data_manager/kraken2_build_database.xml Sat Jan 25 17:41:48 2025 +0000 +++ b/data_manager/kraken2_build_database.xml Wed Nov 05 13:32:18 2025 +0000 @@ -1,15 +1,15 @@ <tool id="kraken2_build_database" name="Kraken2" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description>database builder</description> <macros> - <token name="@TOOL_VERSION@">2.1.3</token> - <token name="@VERSION_SUFFIX@">6</token> - <token name="@PROFILE@">22.01</token> + <token name="@TOOL_VERSION@">2.1.6</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">24.0</token> <xml name="common_params"> <param name="kmer_len" type="integer" value="35" label="K-mer length in BP" /> <param name="minimizer_len" type="integer" value="31" label="Minimizer length" /> <param name="minimizer_spaces" type="integer" value="7" label="Minimizer spaces" /> <param name="load_factor" type="float" value="0.7" min="0" max="1" label="Load factor" help="Proportion of the hash table to be populated" /> - <param name="clean" type="boolean" truevalue="--clean" falsevalue="" checked="true" label="Clean up extra files" /> + <param name="clean" type="boolean" truevalue="--clean" falsevalue="" checked="false" label="Clean up extra files. Note: If the extra files are removed this DB cannot be used to build a bracken DB!" /> </xml> <xml name="viral"> <option value="viral">Viral (viral; ~0.5 GB)</option> @@ -44,57 +44,151 @@ <xml name="pluspfp_16gb"> <option value="pluspfp_16gb">PlusPFP-16 (PlusPFP with DB capped at 16 GB; ~15 GB)</option> </xml> - <xml name="core_nt"> - <option value="core_nt">core_nt (Very large collection, inclusive of GenBank, RefSeq, TPA and PDB; ~182 GB)</option> - </xml> </macros> <xrefs> <xref type="bio.tools">kraken2</xref> </xrefs> <requirements> <requirement type="package" version="@TOOL_VERSION@">kraken2</requirement> - <requirement type="package" version="3.13">python</requirement> </requirements> <version_command>kraken2 -version | head -n 1 | awk '{print $NF}'</version_command> <command detect_errors="exit_code"><![CDATA[ -python '$__tool_directory__/kraken2_build_database.py' - '$out_file' - --database-type '$database_type.database_type' -#if $database_type.database_type == "standard_local_build" - --threads \${GALAXY_SLOTS:-1} - --kmer-len $database_type.kmer_len - --minimizer-len $database_type.minimizer_len - --minimizer-spaces $database_type.minimizer_spaces - --load-factor $database_type.load_factor - $database_type.clean -#else if $database_type.database_type == "standard_prebuilt" - --prebuilt-db '$database_type.prebuild.prebuilt_db' - --prebuilt-date '$database_type.prebuild.prebuilt_date' +#import datetime +#import re + +#set now = datetime.datetime.utcnow().strftime("%Y-%m-%dT%H%M%SZ") +#set commands = [] +mkdir '$out_file.extra_files_path' && + +#if $database_type.database_type == "standard_prebuilt" or $database_type.database_type == "special_prebuilt" or $database_type.database_type == "amplicon_prebuilt" + #set prebuilt_name = { + 'viral': "Viral", + 'minusb': "MinusB (archaea, viral, plasmid, human, UniVec_Core)", + 'standard': "Standard-Full (archaea, bacteria, viral, plasmid, human,UniVec_Core)", + 'standard_08gb': "Standard-8 (Standard with DB capped at 8 GB)", + 'standard_16gb': "Standard-16 (Standard with DB capped at 16 GB)", + 'pluspf': "PlusPF (Standard plus protozoa and fungi)", + 'pluspf_08gb': "PlusPF-8 (PlusPF with DB capped at 8 GB)", + 'pluspf_16gb': "PlusPF-16 (PlusPF with DB capped at 16 GB)", + 'pluspfp': "PlusPFP (Standard plus protozoa, fungi and plant)", + 'pluspfp_08gb': "PlusPFP-8 (PlusPFP with DB capped at 8 GB)", + 'pluspfp_16gb': "PlusPFP-16 (PlusPFP with DB capped at 16 GB)", + } + #set special_name = { + "core_nt_20250609": "Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (July, 2025)", + "core_nt_20241228": "Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (December, 2024)", + "core_nt_20240904": "Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (September, 2024)", + "gtdb_genome_reps_20250609": "GTDB v226 (Bacterial and archaeal)", + "gtdb_genome_reps_20241109": "GTDB v220 (Bacterial and archaeal)", + "eupathdb48_20230407": "EuPathDB-46 (April 18, 2023)", + "eupathdb48_20201113": "EuPathDB-46 (November 13, 2020)" + } + #set amplicon_name = { + "16S_Greengenes13.5_20200326": "Greengenes 13.5", + "16S_RDP11.5_20200326": "RDP 11.5", + "16S_Silva132_20200326": "Silva 132", + "16S_Silva138_20200326": "Silva 138" + } + + #set date_url_str = str($database_type.prebuilt.prebuilt_date).replace('-', '') + #set display_name = prebuilt_name.get(str($database_type.prebuilt.prebuilt_db)) + #if not display_name + #set display_name = special_name.get(str($database_type.prebuilt.prebuilt_db)+"_"+date_url_str) + #end if + #if not display_name + #set display_name = amplicon_name.get(str($database_type.prebuilt.prebuilt_db)+"_"+date_url_str) + #end if + + #set database_value = "_".join([now, "standard_prebuilt", str($database_type.prebuilt.prebuilt_db), str($database_type.prebuilt.prebuilt_date)]) + #set database_name = " ".join(["Prebuilt Refseq indexes: ", display_name, "(Version: ", str($database_type.prebuilt.prebuilt_date), "- Downloaded:", now + ")"]) + + ## the 16S dbs have a different link and file name + ## and are stored in a subfolder + #if $database_type.database_type == "amplicon_prebuilt" + #silent commands.append("wget https://genome-idx.s3.amazonaws.com/kraken/" + str($database_type.prebuilt.prebuilt_db) + "_" + date_url_str + ".tgz") + #silent commands.append("mkdir -p '" + $out_file.extra_files_path + "/" + database_value + "'/tmp_extract") + #silent commands.append("tar -xzf " + str($database_type.prebuilt.prebuilt_db) + "_" + date_url_str + ".tgz -C '" + $out_file.extra_files_path + "/" + database_value + "'/tmp_extract") + #silent commands.append("topdir=$(find '" + $out_file.extra_files_path + "/" + database_value + "/tmp_extract' -mindepth 1 -maxdepth 1 -type d | head -n 1)") + #silent commands.append("if [ -n \"$topdir\" ]") + #silent commands.append("then") + #silent commands.append(" mv \"$topdir\"/* '" + $out_file.extra_files_path + "/" + database_value + "/'") + #silent commands.append("fi") + #silent commands.append("rm -rf '" + $out_file.extra_files_path + "/" + database_value + "/tmp_extract'") + #else + #silent commands.append("wget https://genome-idx.s3.amazonaws.com/kraken/k2_" + str($database_type.prebuilt.prebuilt_db) + "_" + date_url_str + ".tar.gz") + #silent commands.append("mkdir -p '" + $out_file.extra_files_path + "/" + database_value + "'") + #silent commands.append("tar -xzf k2_" + str($database_type.prebuilt.prebuilt_db) + "_" + date_url_str + ".tar.gz -C '" + $out_file.extra_files_path + "/" + database_value + "'") + #end if + #else if $database_type.database_type == "minikraken" - --minikraken2-version '$database_type.minikraken2_version' -#else if $database_type.database_type == "special_prebuilt" - --prebuilt-db '$database_type.special_prebuild.prebuilt_db' - --prebuilt-date '$database_type.special_prebuild.prebuilt_date' -#else if $database_type.database_type == "special" - --threads \${GALAXY_SLOTS:-1} - --special-database-type '$database_type.special_database_type' - --kmer-len $database_type.kmer_len - --minimizer-len $database_type.minimizer_len - --minimizer-spaces $database_type.minimizer_spaces - --load-factor $database_type.load_factor - $database_type.clean -#else if $database_type.database_type == "custom" - --threads \${GALAXY_SLOTS:-1} - --custom-fasta '$database_type.custom_fasta' - --custom-database-name '$database_type.custom_database_name' - --custom-source-info '$database_type.custom_source_info' - $database_type.skip_maps - --kmer-len $database_type.kmer_len - --minimizer-len $database_type.minimizer_len - --minimizer-spaces $database_type.minimizer_spaces - --load-factor $database_type.load_factor - $database_type.clean + #set database_value = "_".join([now, "minikraken2", str($database_type.minikraken2_version), "8GB"]) + #set database_name = " ".join(["Minikraken2", str($database_type.minikraken2_version), "(Created:", now + ")"]) + + #silent commands.append("wget 'https://genome-idx.s3.amazonaws.com/kraken/minikraken2_" + str($database_type.minikraken2_version) + "_8GB_201904.tgz'") + #silent commands.append("mkdir -p '" + $out_file.extra_files_path + "'/'" + database_value + "'") + #silent commands.append("tar -xzf 'minikraken2_" + str($database_type.minikraken2_version) + "_8GB_201904.tgz' -C '" + $out_file.extra_files_path + "'/'" + database_value + "'") +#else + #if $database_type.database_type == "standard_local_build" + #set database_value = "_".join([now, "standard", "kmer-len", str($database_type.kmer_len), "minimizer-len", str($database_type.minimizer_len), "minimizer-spaces", str($database_type.minimizer_spaces), "load-factor", str($database_type.load_factor)]) + #set database_name = " ".join(["Standard (Local Build)", "Created:", now + ",", "kmer-len=" + str($database_type.kmer_len) + ",", "minimizer-len=" + str($database_type.minimizer_len) + ",", "minimizer-spaces=" + str($database_type.minimizer_spaces) + ",", "load-factor=" + str($database_type.load_factor) + ")"]) + #else if $database_type.database_type == "special" + #set special_database_names = {"rdp": "RDP", "greengenes": "Greengenes", "silva": "Silva"} + #set database_value = "_".join([now, str($database_type.special_database_type), "kmer-len", str($database_type.kmer_len), "minimizer-len", str($database_type.minimizer_len), "minimizer-spaces", str($database_type.minimizer_spaces), "load-factor", str($database_type.load_factor)]) + #set database_name = " ".join([special_database_names[str($database_type.special_database_type)], "(Created:", now + ",", "kmer-len=" + str($database_type.kmer_len) + ",", "minimizer-len=" + str($database_type.minimizer_len) + ",", "minimizer-spaces=" + str($database_type.minimizer_spaces) + ",", "load-factor=" + str($database_type.load_factor) + ")"]) + #else if $database_type.database_type == "custom" + #set custom_database_name = re.sub(r'[^\w_.-]+', '_', str($database_type.custom_database_name)).strip('_') + #set database_name = " ".join([custom_database_name, "(" + str($database_type.custom_source_info) + ",", "kmer-len=" + str($database_type.kmer_len) + ",", "minimizer-len=" + str($database_type.minimizer_len) + ",", "minimizer-spaces=" + str($database_type.minimizer_spaces) + ",", "load-factor=" + str($database_type.load_factor) + ")"]) + #set database_value = "_".join([now, custom_database_name, "kmer-len", str($database_type.kmer_len), "minimizer-len", str($database_type.minimizer_len), "minimizer-spaces", str($database_type.minimizer_spaces), "load-factor", str($database_type.load_factor)]) + #else + >&2 echo "invalid database_type: $database_type.database_type" + #end if + + #if $database_type.database_type == "custom" + #silent command = ["kraken2-build", "--threads", '"${GALAXY_SLOTS:-1}"', + "--download-taxonomy", + "--db", "'" + $out_file.extra_files_path + "'/'" + database_value + "'", + str($database_type.skip_maps)] + #silent commands.append(" ".join(command)) + #silent command = ["kraken2-build", "--threads", '"${GALAXY_SLOTS:-1}"', + "--add-to-library", "'" + str($database_type.custom_fasta) + "'", + "--db", "'" + $out_file.extra_files_path + "'/'" + database_value + "'"] + #silent commands.append(" ".join(command)) + #end if + + #silent command = ["kraken2-build", "--threads", '"${GALAXY_SLOTS:-1}"'] + #if $database_type.database_type == "standard_local_build" + #silent command.append("--standard") + #else if $database_type.database_type == "special" + #silent command.extend(["--special", str($database_type.special_database_type)]) + #else if $database_type.database_type == "custom" + #silent command.append("--build") + #end if + #silent command.extend([ + "--kmer-len", str($database_type.kmer_len), + "--minimizer-len", str($database_type.minimizer_len), + "--minimizer-spaces", str($database_type.minimizer_spaces), + "--load-factor", str($database_type.load_factor), + "--db", "'" + $out_file.extra_files_path + "'/'" + database_value + "'"]) + #silent commands.append(" ".join(command)) + + #if $database_type.clean + #silent command = ["kraken2-build", "--threads", '"${GALAXY_SLOTS:-1}"', + "--clean", + "--db", "'" + $out_file.extra_files_path + "'/'" + database_value + "'"] + #silent commands.append(" ".join(command)) + #end if #end if + +#for command in commands + ## In test mode the tool executes `echo COMMAND` instead of `COMMAND` + #if $run_test_command == "false" + echo + #end if + #echo command + && +#end for + +echo '{"data_tables": {"kraken2_databases": [{"value": "$database_value", "name": "$database_name", "path": "$database_value"}]}}' > '$out_file' ]]> </command> <inputs> @@ -104,6 +198,7 @@ <option value="standard_prebuilt">Pre-Built Refseq indexes</option> <option value="minikraken">MiniKraken</option> <option value="special_prebuilt">Special Pre-Built indexes</option> + <option value="amplicon_prebuilt">16S Pre-Built indexes</option> <option value="special">Special</option> <option value="custom">Custom</option> </param> @@ -111,8 +206,9 @@ <expand macro="common_params" /> </when> <when value="standard_prebuilt"> - <conditional name="prebuild"> + <conditional name="prebuilt"> <param name="prebuilt_date" type="select" label="Select index build date"> + <option value="2025-07-14">July 14, 2025</option> <option value="2024-12-28">December 28, 2024</option> <option value="2024-09-04">September 4, 2024</option> <option value="2024-06-05">June 5, 2024</option> @@ -124,6 +220,21 @@ <option value="2020-12-02">December 2, 2020</option> <option value="2020-09-19">September 19, 2020</option> </param> + <when value="2025-07-14"> + <param name="prebuilt_db" type="select" label="Select a prebuilt Refseq index to download"> + <expand macro="viral"/> + <expand macro="minusb"/> + <expand macro="standard"/> + <expand macro="standard_08gb"/> + <expand macro="standard_16gb"/> + <expand macro="pluspf"/> + <expand macro="pluspf_08gb"/> + <expand macro="pluspf_16gb"/> + <expand macro="pluspfp"/> + <expand macro="pluspfp_08gb"/> + <expand macro="pluspfp_16gb"/> + </param> + </when> <when value="2024-12-28"> <param name="prebuilt_db" type="select" label="Select a prebuilt Refseq index to download"> <expand macro="viral"/> @@ -137,7 +248,6 @@ <expand macro="pluspfp"/> <expand macro="pluspfp_08gb"/> <expand macro="pluspfp_16gb"/> - <expand macro="core_nt"/> </param> </when> <when value="2024-09-04"> @@ -152,7 +262,6 @@ <expand macro="pluspf_16gb"/> <expand macro="pluspfp"/> <expand macro="pluspfp_08gb"/> - <expand macro="core_nt"/> </param> </when> <when value="2024-06-05"> @@ -263,12 +372,32 @@ </conditional> </when> <when value="special_prebuilt"> - <conditional name="special_prebuild"> - <param name="special_prebuilt_db" type="select" multiple="false" label="Select pre-built database to download"> - <option value="gtdb_genome_reps_20241109">GTDB v220 (Bacterial and archaeal; ~497 GB) (December 13, 2024)</option> + <conditional name="prebuilt"> + <param name="xyz" type="select" multiple="false" label="Select pre-built database to download"> + <option value="core_nt_20250609">Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (July, 2025)</option> + <option value="core_nt_20241228">Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (December, 2024)</option> + <option value="core_nt_20240904">Very large collection, inclusive of GenBank, RefSeq, TPA and PDB (September, 2024)</option> + <option value="gtdb_genome_reps_20250609">GTDB v226 (Bacterial and archaeal; ~497 GB) (July, 2025)</option> + <option value="gtdb_genome_reps_20241109">GTDB v220 (Bacterial and archaeal; ~644 GB) (December 13, 2024)</option> <option value="eupathdb48_20230407">EuPathDB-46 (April 18, 2023)</option> <option value="eupathdb48_20201113">EuPathDB-46 (November 13, 2020)</option> </param> + <when value="core_nt_20250609"> + <param name="prebuilt_db" type="hidden" value="core_nt"/> + <param name="prebuilt_date" type="hidden" value="20250609"/> + </when> + <when value="core_nt_20241228"> + <param name="prebuilt_db" type="hidden" value="core_nt"/> + <param name="prebuilt_date" type="hidden" value="20241228"/> + </when> + <when value="core_nt_20240904"> + <param name="prebuilt_db" type="hidden" value="core_nt"/> + <param name="prebuilt_date" type="hidden" value="20240904"/> + </when> + <when value="gtdb_genome_reps_20250609"> + <param name="prebuilt_db" type="hidden" value="gtdb_genome_reps"/> + <param name="prebuilt_date" type="hidden" value="2025-06-09"/> + </when> <when value="gtdb_genome_reps_20241109"> <param name="prebuilt_db" type="hidden" value="gtdb_genome_reps"/> <param name="prebuilt_date" type="hidden" value="2024-11-09"/> @@ -283,6 +412,32 @@ </when> </conditional> </when> + <when value="amplicon_prebuilt"> + <conditional name="prebuilt"> + <param name="xyz" type="select" multiple="false" label="Select pre-built database to download"> + <option value="16S_Greengenes13.5_20200326">Greengenes 13.5</option> + <option value="16S_RDP11.5_20200326">RDP 11.5</option> + <option value="16S_Silva132_20200326">Silva 132</option> + <option value="16S_Silva138_20200326">Silva 138</option> + </param> + <when value="16S_Greengenes13.5_20200326"> + <param name="prebuilt_db" type="hidden" value="16S_Greengenes13.5"/> + <param name="prebuilt_date" type="hidden" value="20200326"/> + </when> + <when value="16S_RDP11.5_20200326"> + <param name="prebuilt_db" type="hidden" value="16S_RDP11.5"/> + <param name="prebuilt_date" type="hidden" value="20200326"/> + </when> + <when value="16S_Silva132_20200326"> + <param name="prebuilt_db" type="hidden" value="16S_Silva132"/> + <param name="prebuilt_date" type="hidden" value="20200326"/> + </when> + <when value="16S_Silva138_20200326"> + <param name="prebuilt_db" type="hidden" value="16S_Silva138"/> + <param name="prebuilt_date" type="hidden" value="20200326"/> + </when> + </conditional> + </when> <when value="minikraken"> <param name="minikraken2_version" type="select" multiple="false" label="Select MiniKraken2 database version to download"> <option value="v2">Version 2</option> @@ -293,7 +448,7 @@ <param name="special_database_type" type="select" multiple="false" label="Select database to build"> <option value="greengenes">Greengenes</option> <option value="silva">Silva</option> - <option value="rdp">RDP</option> + <!-- <option value="rdp">RDP</option> https://github.com/DerrickWood/kraken2/issues/736 --> </param> <expand macro="common_params" /> </when> @@ -305,39 +460,43 @@ <expand macro="common_params" /> </when> </conditional> + <param name="run_test_command" type="hidden"/> </inputs> <outputs> <data name="out_file" format="data_manager_json" /> </outputs> <tests> + <!-- standard_local_build --> + <test expect_num_outputs="1"> <conditional name="database_type"> - <param name="database_type" value="custom" /> - <param name="custom_fasta" value="adapter.fa" /> - <param name="custom_database_name" value="custom_database" /> - <param name="custom_source_info" value="from adapter.fa test data" /> - <param name="skip_maps" value="true" /> + <param name="database_type" value="standard_local_build" /> <param name="kmer_len" value="35" /> <param name="minimizer_spaces" value="6"/> <param name="load_factor" value="0.7" /> <param name="clean" value="true"/> </conditional> + <param name="run_test_command" value="false"/> <output name="out_file"> <assert_contents> <has_text text="kraken2_databases"/> <has_text text="path"/> - <has_text text="custom_database (from adapter.fa test data, kmer-len=35, minimizer-len=31, minimizer-spaces=6, load-factor=0.7)"/> + <has_text text="Standard (Local Build)"/> + <has_text text="kmer-len=35, minimizer-len=31, minimizer-spaces=6, load-factor=0.7"/> </assert_contents> </output> </test> + + <!-- standard_prebuilt --> <test> <conditional name="database_type"> <param name="database_type" value="standard_prebuilt" /> - <conditional name="prebuild"> + <conditional name="prebuilt"> <param name="prebuilt_date" value="2022-06-07"/> <param name="prebuilt_db" value="viral"/> </conditional> </conditional> + <param name="run_test_command" value="true"/> <output name="out_file"> <assert_contents> <has_text text="kraken2_databases"/> @@ -353,11 +512,12 @@ <test> <conditional name="database_type"> <param name="database_type" value="standard_prebuilt" /> - <conditional name="prebuild"> + <conditional name="prebuilt"> <param name="prebuilt_date" value="2024-01-12"/> <param name="prebuilt_db" value="viral"/> </conditional> </conditional> + <param name="run_test_command" value="true"/> <output name="out_file"> <assert_contents> <has_text text="kraken2_databases"/> @@ -373,11 +533,12 @@ <test> <conditional name="database_type"> <param name="database_type" value="standard_prebuilt" /> - <conditional name="prebuild"> + <conditional name="prebuilt"> <param name="prebuilt_date" value="2024-06-05"/> <param name="prebuilt_db" value="viral"/> </conditional> </conditional> + <param name="run_test_command" value="true"/> <output name="out_file"> <assert_contents> <has_text text="kraken2_databases"/> @@ -389,6 +550,137 @@ </assert_contents> </output> </test> + + <!-- minikraken --> + + <test> + <conditional name="database_type"> + <param name="database_type" value="minikraken" /> + <param name="minikraken2_version" value="v1"/> + </conditional> + <param name="run_test_command" value="false"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="Minikraken2"/> + <has_text text="v1"/> + <has_text text="Created"/> + </assert_contents> + </output> + </test> + + <!-- special_prebuilt --> + + <test> + <conditional name="database_type"> + <param name="database_type" value="special_prebuilt" /> + <conditional name="prebuilt"> + <param name="xyz" value="eupathdb48_20201113"/> + <param name="prebuilt_date" value="2020-11-13"/> + <param name="prebuilt_db" value="eupathdb48"/> + </conditional> + </conditional> + <param name="run_test_command" value="false"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="Prebuilt Refseq indexes: EuPathDB-46"/> + <has_text text="standard_prebuilt_eupathdb48_2020-11-13"/> + <has_text text="Prebuilt Refseq indexes"/> + <has_text text="Downloaded"/> + </assert_contents> + </output> + </test> + + <!-- amplicon_prebuilt --> + + <test> + <conditional name="database_type"> + <param name="database_type" value="amplicon_prebuilt" /> + <conditional name="prebuilt"> + <param name="xyz" value="16S_Greengenes13.5_20200326"/> + <param name="prebuilt_date" value="20200326"/> + <param name="prebuilt_db" value="16S_Greengenes13.5"/> + </conditional> + </conditional> + <param name="run_test_command" value="false"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="16S_Greengenes13.5"/> + </assert_contents> + </output> + </test> + + <!-- special --> + + <test expect_num_outputs="1"> + <conditional name="database_type"> + <param name="database_type" value="special" /> + <param name="special_database_type" value="greengenes" /> + <param name="kmer_len" value="35" /> + <param name="minimizer_spaces" value="6"/> + <param name="load_factor" value="0.7" /> + <param name="clean" value="true"/> + </conditional> + <param name="run_test_command" value="true"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="Greengenes"/> + <has_text text="kmer-len=35, minimizer-len=31, minimizer-spaces=6, load-factor=0.7"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="1"> + <conditional name="database_type"> + <param name="database_type" value="special" /> + <param name="special_database_type" value="silva" /> + <param name="kmer_len" value="35" /> + <param name="minimizer_spaces" value="6"/> + <param name="load_factor" value="0.7" /> + <param name="clean" value="true"/> + </conditional> + <param name="run_test_command" value="true"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="Silva"/> + <has_text text="kmer-len=35, minimizer-len=31, minimizer-spaces=6, load-factor=0.7"/> + </assert_contents> + </output> + </test> + + <!-- custom --> + + <test expect_num_outputs="1"> + <conditional name="database_type"> + <param name="database_type" value="custom" /> + <param name="custom_fasta" value="adapter.fa" /> + <param name="custom_database_name" value="custom_database" /> + <param name="custom_source_info" value="from adapter.fa test data" /> + <param name="skip_maps" value="true" /> + <param name="kmer_len" value="35" /> + <param name="minimizer_spaces" value="6"/> + <param name="load_factor" value="0.7" /> + <param name="clean" value="true"/> + </conditional> + <param name="run_test_command" value="true"/> + <output name="out_file"> + <assert_contents> + <has_text text="kraken2_databases"/> + <has_text text="path"/> + <has_text text="custom_database (from adapter.fa test data, kmer-len=35, minimizer-len=31, minimizer-spaces=6, load-factor=0.7)"/> + </assert_contents> + </output> + </test> + + </tests> <help><