# HG changeset patch # User iuc # Date 1571578641 14400 # Node ID c2e4127fb5bf5eb3069bb614ac7e52867821eed5 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_build_bracken_database/ commit 74e81c69c8806d98beb15a889741bcd702866ce3" diff -r 000000000000 -r c2e4127fb5bf data_manager/bracken_build_database.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/bracken_build_database.py Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import argparse +import errno +import json +import os +import subprocess +import uuid + + +DATA_TABLE_NAME = "bracken_databases" + + +def bracken_build_database(target_directory, bracken_build_args, database_name, data_table_name=DATA_TABLE_NAME): + + database_value = str(uuid.uuid4()) + + database_name = database_name + + database_path = os.path.join(bracken_build_args['kraken_database'], 'database' + str(bracken_build_args['read_len']) + 'mers.kmer_distrib') + + bracken_build_args_list = [ + '-t', bracken_build_args['threads'], + '-k', bracken_build_args['kmer_len'], + '-l', bracken_build_args['read_len'], + '-d', bracken_build_args['kraken_database'], + ] + + subprocess.check_call(['bracken-build'] + bracken_build_args_list) + + data_table_entry = { + "data_tables": { + data_table_name: [ + { + "value": database_value, + "name": database_name, + "path": database_path, + } + ] + } + } + + return data_table_entry + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('data_manager_json') + parser.add_argument('--threads', dest='threads', default=1, help='threads') + parser.add_argument('--kmer-len', dest='kmer_len', help='K-mer length') + parser.add_argument('--read-len', dest='read_len', help='Read length') + parser.add_argument('--kraken-db', dest='kraken_database', help='Kraken Database') + parser.add_argument('--database-name', dest='database_name', help='Database Name') + 
args = parser.parse_args() + + data_manager_input = json.loads(open(args.data_manager_json).read()) + + target_directory = data_manager_input['output_data'][0]['extra_files_path'] + + bracken_build_args = { + 'threads': args.threads, + 'kmer_len': args.kmer_len, + 'read_len': args.read_len, + 'kraken_database': args.kraken_database, + } + + try: + os.mkdir(target_directory) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir( target_directory ): + pass + else: + raise + + data_manager_output = {} + + data_manager_output = bracken_build_database( + target_directory, + bracken_build_args, + args.database_name, + ) + + with open(args.data_manager_json, 'w') as out: + out.write(json.dumps(data_manager_output, sort_keys=True)) + + +if __name__ == "__main__": + main() diff -r 000000000000 -r c2e4127fb5bf data_manager/bracken_build_database.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/bracken_build_database.xml Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,48 @@ + + + bracken database builder + + bracken + kraken2 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.7717/peerj-cs.104 + + diff -r 000000000000 -r c2e4127fb5bf data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,11 @@ + + + + + + + + + + + diff -r 000000000000 -r c2e4127fb5bf test-data/kraken2_databases.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/kraken2_databases.loc Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,6 @@ +# Tab separated with three columns: +# - value (Galaxy records this in the Galaxy DB) +# - name (Galaxy shows this in the UI) +# - path (folder name containing the Kraken DB) +# +test_entry "Test Database" ${__HERE__}/test_db diff -r 000000000000 -r c2e4127fb5bf test-data/nodes_patterns.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/nodes_patterns.txt Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,15 @@ +^220341\s +^90370\s 
#!/bin/bash
# Changeset content that preceded this script on the same source line (tail of
# test-data/nodes_patterns.txt and the diff header), preserved as found:
# +^59201\s +^28901\s +^590\s +^543\s +^91347\s +^1236\s +^1224\s +^2\s +^131567\s +^1\s +^585057\s +^562\s +^561\s
# diff -r 000000000000 -r c2e4127fb5bf test-data/reproduce_test_dataset.sh
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/test-data/reproduce_test_dataset.sh Sun Oct 20 09:37:21 2019 -0400
# @@ -0,0 +1,18 @@

# This script produces a small kraken2 database containing only a ~1kb portion
# each of a Salmonella and an E. coli genome.
# It requires kraken2 and entrez-direct (both available on bioconda).
# Run it from the directory that contains nodes_patterns.txt.

# Abort on the first failing step so a failed download or an empty grep result
# cannot silently produce a broken database.
set -euo pipefail

# kraken2-build spells this option with hyphens.
kraken2-build --db test_db --download-taxonomy

# Keep the full taxonomy files aside and cut each one down to the two genomes
# used by the test.
mv test_db/taxonomy/nucl_gb.accession2taxid test_db/taxonomy/nucl_gb.accession2taxid_full
grep -e 'NC_003198.1' -e 'NC_011750.1' test_db/taxonomy/nucl_gb.accession2taxid_full > test_db/taxonomy/nucl_gb.accession2taxid
mv test_db/taxonomy/nodes.dmp test_db/taxonomy/nodes.dmp_full
# The pattern file shipped with this changeset is named nodes_patterns.txt.
grep -f nodes_patterns.txt test_db/taxonomy/nodes.dmp_full > test_db/taxonomy/nodes.dmp
mv test_db/taxonomy/names.dmp test_db/taxonomy/names.dmp_full
grep -e '^220341\s' -e '^585057\s' test_db/taxonomy/names.dmp_full > test_db/taxonomy/names.dmp

# Fetch both genomes and keep only the first 14 FASTA lines of each.
esearch -db nucleotide -query "NC_003198.1" | efetch -format fasta > NC_003198.1.fasta
esearch -db nucleotide -query "NC_011750.1" | efetch -format fasta > NC_011750.1.fasta
head -n 14 NC_003198.1.fasta > NC_003198.1_1kb.fasta
head -n 14 NC_011750.1.fasta > NC_011750.1_1kb.fasta

kraken2-build --db test_db --add-to-library NC_003198.1_1kb.fasta
kraken2-build --db test_db --add-to-library NC_011750.1_1kb.fasta
kraken2-build --db test_db --build

# Changeset content that followed this script on the same source line,
# preserved as found:
# diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/hash.k2d Binary file test-data/test_db/hash.k2d has changed
# diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/9C7DdW7GAD.fna --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/9C7DdW7GAD.fna Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,17 @@
# +>NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str.
CT18, complete genome +AGAGATTACGTCTGGTTGCAAGAGATCATAACAGGGGAAATTGATTGAAAATAAATATAT +CGCCAGCAGCACATGAACAAGTTTCGGAATGTGATCAATTTAAAAATTTATTGACTTAGG +CGGGCAGATACTTTAACCAATATAGGAATACAAGACAGACAAATAAAAATGACAGAGTAC +ACAACATCCATGAACCGCATCAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGT +AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGAACAGTGCGG +GCxxxxxxxxCGACCAGAGATCACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT +ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATTCC +AGGCAAGGGCAGGTAGCGACCGTACTTTCCGCCCCCGCGAAAATTACCAACCATCTGGTG +GCGATGATTGAAAAAACTATCGGCGGCCAGGATGCTTTGCCGAATATCAGCGATGCCGAA +CGTATTTTTTCTGACCTGCTCGCAGGACTTGCCAGCGCGCAGCCGGGATTCCCGCTTGCA +CGGTTGAAAATGGTTGTCGAACAAGAATTCGCTCAGATCAAACATGTTTTGCATGGTATC +AGCCTGCTGGGTCAGTGCCCGGATAGCATCAACGCCGCGCTGATTTGCCGTGGCGAAAAA +ATGTCGATCGCGATTATGGCGGGACTCCTGGAGGCGCGTGGACATCGCGTCACGGTGATC +GATCCGGTAGAAAAACTGCTGGCGGTGGGCCATTACCTTGAATCTACCGTCGATATCGCG +GAATCGACTCGCCGTATCGCCGCCAGCCAGATCCCGGCCGATCACATGATCCTGATGGCG +GGCTTTACTG diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/9C7DdW7GAD.fna.masked diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/cWk1IBlK73.fna --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/cWk1IBlK73.fna Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,17 @@ +>NC_011750.1 Escherichia coli IAI39 chromosome, complete genome +GCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTxxxxxxxGAGTGTCT +GATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGGT +CACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTACA +CAACATCCATGAAACGCATTAGxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxAGGTA +ACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGGC +xxxxxxxxCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGTAC +ATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCCAG +GCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTGGC +GATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAACG +TATTTTTGCCGAACTTCTGACGGGACTCGCCGCTGCCCAACCGGGATTCCCGCTGGCGCA 
+ACTGAAAACTTTCGTCGATCAGGAATTTGCCCAAATAAAACATGTCCTGCATGGCATTAG +TTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTGATTTGCCGTGGCGAGAAAAT +GTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGTCACAACGTTACCGTTATCGA +TCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAATCTACCGTCGATATTGCTGA +GTCCACCCGCCGTATTGCGGCAAGTCGTATTCCGGCTGATCACATGGTGCTGATGGCAGG +TTTCACCGCC diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/cWk1IBlK73.fna.masked diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/prelim_map.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map.txt Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,2 @@ +ACCNUM NC_011750.1 NC_011750 +ACCNUM NC_003198.1 NC_003198 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map_QXr8C5PiOX.txt Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,1 @@ +ACCNUM NC_003198.1 NC_003198 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/library/added/prelim_map_l8ftMYsZv0.txt Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,1 @@ +ACCNUM NC_011750.1 NC_011750 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/opts.k2d Binary file test-data/test_db/opts.k2d has changed diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/seqid2taxid.map --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/seqid2taxid.map Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,1 @@ +NC_011750.1 585057 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/taxo.k2d Binary file test-data/test_db/taxo.k2d has changed diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/taxonomy/names.dmp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/names.dmp Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,5 @@ +220341 | Salmonella enterica subsp. 
enterica serovar Typhi CT18 | | equivalent name | +220341 | Salmonella enterica subsp. enterica serovar Typhi str. CT18 | | scientific name | +220341 | Salmonella enterica subsp. enterica serovar Typhi strain CT18 | | equivalent name | +220341 | Salmonella typhi CT18 | | equivalent name | +585057 | Escherichia coli IAI39 | | scientific name | diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/taxonomy/nodes.dmp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/nodes.dmp Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,15 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +562 | 561 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +590 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +28901 | 590 | species | SE | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +59201 | 28901 | subspecies | SE | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +90370 | 59201 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +220341 | 90370 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +585057 | 562 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/taxonomy/nucl_gb.accession2taxid --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/nucl_gb.accession2taxid Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,2 @@ +NC_003198 NC_003198.1 220341 16758993 +NC_011750 NC_011750.1 585057 218698419 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/taxonomy/prelim_map.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/prelim_map.txt Sun Oct 20 09:37:21 2019 
-0400 @@ -0,0 +1,2 @@ +ACCNUM NC_011750.1 NC_011750 +ACCNUM NC_003198.1 NC_003198 diff -r 000000000000 -r c2e4127fb5bf test-data/test_db/unmapped.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/unmapped.txt Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,1 @@ +NC_003198 diff -r 000000000000 -r c2e4127fb5bf tool-data/bracken_databases.loc.sample diff -r 000000000000 -r c2e4127fb5bf tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+
diff -r 000000000000 -r c2e4127fb5bf tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Sun Oct 20 09:37:21 2019 -0400 @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+