Mercurial > repos > iuc > kraken_taxonomy_report

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_taxonomy_report.py	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,278 @@
+#!/usr/bin/env python
+
+# Reports a summary of Kraken's results
+# and optionally creates a newick Tree
+# Copyright (c) 2016 Daniel Blankenberg
+# Licensed under the Academic Free License version 3.0
+# https://github.com/blankenberg/Kraken-Taxonomy-Report
+
+import sys
+import os
+import optparse
+import re
+
+__VERSION__ = '0.0.1'
+
+__URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report"
+
+# Rank names were pulled from ncbi nodes.dmp on 02/02/2016
+# cat nodes.dmp | cut -f 5 | sort | uniq
+# "root" is added manually
+NO_RANK_NAME = "no rank"
+RANK_NAMES = [ NO_RANK_NAME,
+               "root",
+               "superkingdom",
+               "kingdom",
+               "subkingdom",
+               "superphylum",
+               "phylum",
+               "subphylum",
+               "superclass",
+               "class",
+               "subclass",
+               "infraclass",
+               "superorder",
+               "order",
+               "suborder",
+               "infraorder",
+               "parvorder",
+               "superfamily",
+               "family",
+               "subfamily",
+               "tribe",
+               "subtribe",
+               "genus",
+               "subgenus",
+               "species group",
+               "species subgroup",
+               "species",
+               "subspecies",
+               "varietas",
+               "forma" ]
+# NB: We put 'no rank' at top of list for generating trees, due to e.g.
+# root (root) -> cellular organisms (no rank) -> bacteria (superkingdom)
+
+RANK_NAME_TO_INTS = dict( [ (y, x) for (x, y) in enumerate( RANK_NAMES ) ] )
+RANK_NAMES_INTS = range( len( RANK_NAMES ) )
+
+NO_RANK_INT = RANK_NAMES.index( NO_RANK_NAME )
+NO_RANK_CODE = 'n'
+
+PRIMARY_RANK_NAMES = [ 'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom' ]
+RANK_INT_TO_CODE = {}
+for name in PRIMARY_RANK_NAMES:
+    RANK_INT_TO_CODE[ RANK_NAMES.index( name ) ] = name[0]
+RANK_INT_TO_CODE[ RANK_NAMES.index( 'superkingdom' ) ] = 'd'
+PRIMARY_RANK_NAMES.append( 'superkingdom' )
+
+NAME_STUB = "%s__%s"
+NAME_RE = re.compile( "(\t| |\||\.;)" )
+NAME_REPL = "_"
+
+
+def get_kraken_db_path( db ):
+    assert db, ValueError( "You must provide a kraken database" )
+    k_db_path = os.getenv('KRAKEN_DB_PATH', None )
+    if k_db_path:
+        db = os.path.join( k_db_path, db )
+    return db
+
+
+def load_taxonomy( db_path, sanitize_names=False ):
+    child_lists = {}
+    name_map = {}
+    rank_map = {}
+    with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh:
+        for line in fh:
+            line = line.rstrip( "\n\r" )
+            if line.endswith( "\t|" ):
+                line = line[:-2]
+            fields = line.split( "\t|\t" )
+            node_id = fields[0]
+            name = fields[1]
+            if sanitize_names:
+                name = NAME_RE.sub( NAME_REPL, name )
+            name_type = fields[3]
+            if name_type == "scientific name":
+                name_map[ node_id ] = name
+
+    with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh:
+        for line in fh:
+            line = line.rstrip( "\n\r" )
+            fields = line.split( "\t|\t" )
+            node_id = fields[0]
+            parent_id = fields[1]
+            rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None )
+            if rank is None:
+                # This should never happen, unless new taxonomy ranks are created
+                print >> sys.stderr, 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME )
+                rank = NO_RANK_INT
+            if node_id == '1':
+                parent_id = '0'
+            if parent_id not in child_lists:
+                child_lists[ parent_id ] = []
+            child_lists[ parent_id ].append( node_id )
+            rank_map[node_id] = rank
+    return ( child_lists, name_map, rank_map )
+
+
+def dfs_summation( node, counts, child_lists ):
+    children = child_lists.get( node, None )
+    if children:
+        for child in children:
+            dfs_summation( child, counts, child_lists )
+            counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 )
+
+
+def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ):
+    if not options.summation and ( not options.show_zeros and node not in hit_taxa ):
+        return
+    rank_int = rank_map[node]
+    code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE )
+    if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa):
+        if name is None:
+            name = ""
+        else:
+            name = "%s|" % name
+        if tax is None:
+            tax = ''
+        else:
+            tax = "%s;" % tax
+        sanitized_name = name_map[ node ]
+        name_stub = NAME_STUB % ( code, sanitized_name )
+        name = name + name_stub
+        tax = tax + name_stub
+        if options.name_id:
+            output = node
+        elif options.name_long:
+            output = name
+        else:
+            output = sanitized_name
+        for val in file_data:
+            output = "%s\t%i" % ( output, val.get( node, 0 ) )
+        if options.show_rank:
+            output = "%s\t%s" % ( output, RANK_NAMES[ rank_int ] )
+        if options.taxonomy:
+            output = "%s\t%s" % ( output, tax )
+        output_lines[ rank_int ].append( output )
+    children = child_lists.get( node )
+    if children:
+        for child in children:
+            dfs_report( child, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=name, tax=tax )
+
+
+def write_tree( child_lists, name_map, rank_map, options, branch_length=1 ):
+    """Write the taxonomy as a newick tree to options.output_tree.
+
+    child_lists   -- node id -> list of child node ids; the first child of
+                     "0" is taken as the root
+    name_map      -- node id -> name, used as clade name unless
+                     options.name_id is set
+    rank_map      -- node id -> rank integer
+    options       -- parsed options; name_id, cluster and output_tree are read.
+                     cluster is compared against rank integers, so it is
+                     expected to be None or a rank integer here
+    branch_length -- branch length given to every clade
+
+    The tree is built from the taxonomy alone; no read counts are consulted.
+    """
+    # Uses Biopython, only load if making tree
+    import Bio.Phylo
+    from Bio.Phylo import BaseTree
+
+    def _get_name( node_id ):
+        # Clade label: the node id itself, or its name from name_map
+        if options.name_id:
+            return node_id
+        return name_map[node_id]
+    # node id -> Clade, for every node added to the tree so far
+    nodes = {}
+    root_node_id = child_lists["0"][0]
+    nodes[root_node_id] = BaseTree.Clade( name=_get_name( root_node_id), branch_length=branch_length )
+
+    def recurse_children( parent_id ):
+        # Attach the children of parent_id to its clade, then descend into them
+        if options.cluster is not None and rank_map[parent_id] == options.cluster:
+            # Short circuit if we found our rank, prevents 'hanging' no ranks from being output
+            # e.g. clustering by "species" (Escherichia coli), but have "no rank" below (Escherichia coli K-12) in test_db
+            return
+        if parent_id not in nodes:
+            nodes[parent_id] = BaseTree.Clade( name=_get_name( parent_id ), branch_length=branch_length )
+        for child_id in child_lists.get( parent_id, [] ):
+            # When clustering, only children whose rank integer does not
+            # exceed the cluster rank are added
+            if options.cluster is None or ( rank_map[child_id] <= options.cluster  ):
+                if child_id not in nodes:
+                    nodes[child_id] = BaseTree.Clade(name=_get_name( child_id ), branch_length=branch_length)
+                nodes[parent_id].clades.append(nodes[child_id])
+                recurse_children( child_id )
+    recurse_children( root_node_id )
+    tree = BaseTree.Tree(root=nodes[root_node_id])
+    Bio.Phylo.write( [tree], options.output_tree, 'newick' )
+
+
+def __main__():
+    parser = optparse.OptionParser( usage="%prog [options] file1 file...fileN" )
+    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
+    parser.add_option( '', '--show-zeros', dest='show_zeros', action='store_true', default=False, help='Show empty nodes' )
+    parser.add_option( '', '--header-line', dest='header_line', action='store_true', default=False, help='Provide a header on output' )
+    parser.add_option( '', '--intermediate', dest='intermediate', action='store_true', default=False, help='Intermediate Ranks' )
+    parser.add_option( '', '--name-id', dest='name_id', action='store_true', default=False, help='Use Taxa ID instead of Name' )
+    parser.add_option( '', '--name-long', dest='name_long', action='store_true', default=False, help='Use Long taxa ID instead of base name' )
+    parser.add_option( '', '--taxonomy', dest='taxonomy', action='store_true', default=False, help='Output taxonomy in last column' )
+    parser.add_option( '', '--cluster', dest='cluster', action='store', type="string", default=None, help='Cluster counts to specified rank' )
+    parser.add_option( '', '--summation', dest='summation', action='store_true', default=False, help='Add summation of child counts to each taxa' )
+    parser.add_option( '', '--sanitize-names', dest='sanitize_names', action='store_true', default=False, help='Replace special chars (\t| |\||\.;) with underscore (_)' )
+    parser.add_option( '', '--show-rank', dest='show_rank', action='store_true', default=False, help='Output column with Rank name' )
+    parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' )
+    parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' )
+    parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' )
+    (options, args) = parser.parse_args()
+    if options.version:
+        print >> sys.stderr, "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ )
+        sys.exit()
+    if not args:
+        print >> sys.stderr, parser.get_usage()
+        sys.exit()
+
+    if options.cluster:
+        cluster_name = options.cluster.lower()
+        cluster = RANK_NAME_TO_INTS.get( cluster_name, None )
+        assert cluster is not None, ValueError( '"%s" is not a valid rank for clustering.' % options.cluster )
+        if cluster_name not in PRIMARY_RANK_NAMES:
+            assert options.intermediate, ValueError( 'You cannot cluster by "%s", unless you enable intermediate ranks.' % options.cluster )
+        ranks_to_report = [ cluster ]
+        options.cluster = cluster
+        # When clustering we need to do summatation
+        options.summation = True
+    else:
+        options.cluster = None  # make empty string into None
+        ranks_to_report = RANK_NAMES_INTS
+
+    if options.output:
+        output_fh = open( options.output, 'wb+' )
+    else:
+        output_fh = sys.stdout
+
+    db_path = get_kraken_db_path( options.db )
+    ( child_lists, name_map, rank_map ) = load_taxonomy( db_path, sanitize_names=options.sanitize_names )
+    file_data = []
+    hit_taxa = []
+    for input_filename in args:
+        taxo_counts = {}
+        with open( input_filename ) as fh:
+            for line in fh:
+                fields = line.split( "\t" )
+                taxo_counts[ fields[2] ] = taxo_counts.get( fields[2], 0 ) + 1
+        clade_counts = taxo_counts.copy()  # fixme remove copying?
+        if options.summation:
+            dfs_summation( '1', clade_counts, child_lists )
+        for key, value in clade_counts.items():
+            if value and key not in hit_taxa:
+                hit_taxa.append( key )
+        file_data.append( clade_counts )
+
+    if options.header_line:
+        output_fh.write( "#ID\t" )
+        output_fh.write( "\t".join( args ) )
+        if options.show_rank:
+            output_fh.write( "\trank" )
+        if options.taxonomy:
+            output_fh.write( "\ttaxonomy" )
+        output_fh.write( '\n' )
+
+    output_lines = dict( [ ( x, [] ) for x in RANK_NAMES_INTS ] )
+    dfs_report( '1', file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None )
+
+    for rank_int in ranks_to_report:
+        for line in output_lines.get( rank_int, [] ):
+            output_fh.write( line )
+            output_fh.write( '\n' )
+    fh.close()
+    if options.output_tree:
+        write_tree( child_lists, name_map, rank_map, options )
+
+
+if __name__ == "__main__":
+    __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kraken_taxonomy_report.xml	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,242 @@
+<?xml version="1.0"?>
+<tool id="kraken_taxonomy_report" name="Kraken taxonomic report" version="0.0.1">
+    <description>view report of classification for multiple samples</description>
+    <requirements>
+        <requirement type="package" version="1.66">biopython</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+    </stdio>
+    <version_command>python ${__tool_directory__}/kraken_taxonomy_report.py --version</version_command>
+    <command>
+<![CDATA[
+
+#for $input_classification in $classification:
+    ln -s "${input_classification}" "${input_classification.element_identifier}" &&
+#end for
+
+export KRAKEN_DB_PATH="${kraken_database.fields.path}" &&
+python ${__tool_directory__}/kraken_taxonomy_report.py
+
+--db "${kraken_database.fields.name}"
+${show_zeros}
+${header_line}
+${otu_name}
+${taxonomy}
+${show_rank}
+${intermediate}
+${sanitize_names}
+#if str( $cluster.cluster ):
+    --cluster "${cluster.cluster}"
+#else:
+    ${cluster.summation}
+#end if
+
+--output "${output_report}"
+
+#if $output_tree:
+    --output-tree "${output_tree}"
+#end if
+
+#for $input_classification in $classification:
+    "${input_classification.element_identifier}"
+#end for
+
+]]>
+    </command>
+    <inputs>
+        <param format="tabular" label="Kraken output" multiple="True" name="classification" type="data" />
+        <param checked="False" falsevalue="" argument="--show-zeros" label="Display taxa even if they lack a read in any sample" name="show_zeros" truevalue="--show-zeros" type="boolean" />
+        <param checked="True" falsevalue="" argument="--header-line" label="Display a header line indicating sample IDs" name="header_line" truevalue="--header-line" type="boolean" />
+        <param label="Select a Kraken database" name="kraken_database" type="select" help="Select the same database used to classify reads">
+            <options from_data_table="kraken_databases">
+                <validator message="No Kraken databases are available" type="no_options" />
+            </options>
+        </param>
+        <param label="How to name OTUs" name="otu_name" type="select" multiple="False">
+            <option value="" selected="True">Node name only</option>
+            <option value="--name-long">Taxonified Name</option>
+            <option value="--name-id">Node ID</option>
+        </param>
+        <param checked="True" falsevalue="" argument="--sanitize-names" label="Sanitize Names" name="sanitize_names" truevalue="--sanitize-names" type="boolean" help="Replace special chars (\t| |\||\.;) with underscore (_)" />
+        <param checked="False" falsevalue="" argument="--show-rank" label="Output Rank Name in (second to) last column" name="show_rank" truevalue="--show-rank" type="boolean" />
+        <param checked="False" falsevalue="" argument="--taxonomy" label="Output taxonomy in last column" name="taxonomy" truevalue="--taxonomy" type="boolean" />
+        <param checked="False" falsevalue="" argument="--intermediate" label="Display intermediate ranks" name="intermediate" truevalue="--intermediate" type="boolean" />
+        <conditional name="cluster">
+            <param argument="--cluster" help="Combines rows under the selected taxon and reports only selected rank." label="Cluster by taxonomic rank" name="cluster" type="select">
+                <option value="Superkingdom">Superkingdom</option>
+                <option value="Kingdom">Kingdom</option>
+                <option value="Subkingdom">Subkingdom</option>
+                <option value="Superphylum">Superphylum</option>
+                <option value="Phylum">Phylum</option>
+                <option value="Subphylum">Subphylum</option>
+                <option value="Superclass">Superclass</option>
+                <option value="Class">Class</option>
+                <option value="Subclass">Subclass</option>
+                <option value="Infraclass">Infraclass</option>
+                <option value="Superorder">Superorder</option>
+                <option value="Order">Order</option>
+                <option value="Suborder">Suborder</option>
+                <option value="Infraorder">Infraorder</option>
+                <option value="Parvorder">Parvorder</option>
+                <option value="Superfamily">Superfamily</option>
+                <option value="Family">Family</option>
+                <option value="Subfamily">Subfamily</option>
+                <option value="Tribe">Tribe</option>
+                <option value="Subtribe">Subtribe</option>
+                <option value="Genus">Genus</option>
+                <option value="Subgenus">Subgenus</option>
+                <option value="Species Group">Species Group</option>
+                <option value="Species Subgroup">Species Subgroup</option>
+                <option value="Species">Species</option>
+                <option value="Subspecies">Subspecies</option>
+                <option value="Varietas">Varietas</option>
+                <option value="Forma">Forma</option>
+                <option value="" selected="True">No Clustering</option>
+            </param>
+            <when value="">
+                <param checked="False" falsevalue="" argument="--summation" label="Summation of lower ranks into higher ranks" name="summation" truevalue="--summation" type="boolean" />
+            </when>
+            <when value="Superkingdom"/>
+            <when value="Kingdom"/>
+            <when value="Subkingdom"/>
+            <when value="Superphylum"/>
+            <when value="Phylum"/>
+            <when value="Subphylum"/>
+            <when value="Superclass"/>
+            <when value="Class"/>
+            <when value="Subclass"/>
+            <when value="Infraclass"/>
+            <when value="Superorder"/>
+            <when value="Order"/>
+            <when value="Suborder"/>
+            <when value="Infraorder"/>
+            <when value="Parvorder"/>
+            <when value="Superfamily"/>
+            <when value="Family"/>
+            <when value="Subfamily"/>
+            <when value="Tribe"/>
+            <when value="Subtribe"/>
+            <when value="Genus"/>
+            <when value="Subgenus"/>
+            <when value="Species Group"/>
+            <when value="Species Subgroup"/>
+            <when value="Species"/>
+            <when value="Subspecies"/>
+            <when value="Varietas"/>
+            <when value="Forma"/>
+        </conditional>
+        <param checked="False" falsevalue="" label="Output a newick tree" name="tree" truevalue="true" type="boolean" help="Trees are pruned at specified rank when clustering"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output_report" label="${tool.name} on ${on_string} (Abundances)"/>
+        <data format="txt" name="output_tree" label="${tool.name} on ${on_string} (Newick Tree)">
+            <filter>tree</filter>
+        </data>
+    </outputs>
+    <tests>
+        <test>
+            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
+            <param name="show_zeros" value="True"/>
+            <param name="header_line" value="True"/>
+            <param name="kraken_database" value="test_db"/>
+            <param name="otu_name" value=""/>
+            <param name="sanitize_names" value="True"/>
+            <param name="show_rank" value="True"/>
+            <param name="taxonomy" value="True"/>
+            <param name="intermediate" value="True"/>
+            <conditional name="cluster">
+                <param name="cluster" value=""/>
+                <param name="summation" value="True"/>
+            </conditional>
+            <param name="tree" value="True"/>
+            <output name="output_report" file="output_abundance_1.tabular" ftype="tabular"/>
+            <output name="output_tree" file="output_tree_1.newick" />
+        </test>
+        <test>
+            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
+            <param name="show_zeros" value="True"/>
+            <param name="header_line" value="True"/>
+            <param name="kraken_database" value="test_db"/>
+            <param name="otu_name" value=""/>
+            <param name="sanitize_names" value="True"/>
+            <param name="show_rank" value="True"/>
+            <param name="taxonomy" value="True"/>
+            <param name="intermediate" value="True"/>
+            <conditional name="cluster">
+                <param name="cluster" value=""/>
+                <param name="summation" value="False"/>
+            </conditional>
+            <param name="tree" value="True"/>
+            <output name="output_report" file="output_abundance_2.tabular" ftype="tabular"/>
+            <output name="output_tree" file="output_tree_1.newick" />
+        </test>
+        <test>
+            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
+            <param name="show_zeros" value="True"/>
+            <param name="header_line" value="True"/>
+            <param name="kraken_database" value="test_db"/>
+            <param name="otu_name" value=""/>
+            <param name="sanitize_names" value="True"/>
+            <param name="show_rank" value="True"/>
+            <param name="taxonomy" value="True"/>
+            <param name="intermediate" value="False"/>
+            <conditional name="cluster">
+                <param name="cluster" value="Species"/>
+            </conditional>
+            <param name="tree" value="True"/>
+            <output name="output_report" file="output_abundance_3.tabular" ftype="tabular"/>
+            <output name="output_tree" file="output_tree_3.newick" />
+        </test>
+        <test>
+            <param name="classification" value="input_kraken_1.tabular,input_kraken_2.tabular" ftype="tabular"/>
+            <param name="show_zeros" value="True"/>
+            <param name="header_line" value="True"/>
+            <param name="kraken_database" value="test_db"/>
+            <param name="otu_name" value=""/>
+            <param name="sanitize_names" value="True"/>
+            <param name="show_rank" value="True"/>
+            <param name="taxonomy" value="True"/>
+            <param name="intermediate" value="False"/>
+            <conditional name="cluster">
+                <param name="cluster" value="Species"/>
+            </conditional>
+            <param name="tree" value="True"/>
+            <output name="output_report" file="output_abundance_4.tabular" ftype="tabular"/>
+            <output name="output_tree" file="output_tree_3.newick" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
+.. class:: warningmark
+
+**Note**: the database used must be the same as the one used in the original Kraken run
+
+-----
+
+**What it Does**
+
+Summarizes read counts across taxonomic ranks for multiple samples. This is convenient for comparing results across multiple experiments, conditions, locations, etc.
+
+-----
+
+**Output**
+
+The output is tab-delimited, with one line per taxon.
+
+Will optionally output a newick tree built from the kraken database taxonomy using the specified options. Tree branch lengths will be set to "1.00000".
+
+
+]]>
+    </help>
+    <citations>
+        <citation type="bibtex">@unpublished{Kraken-Taxonomy-Report:2016,
+          title  = "Kraken Taxonomy Report",
+          author = "Daniel Blankenberg",
+          url    = "https://github.com/blankenberg/Kraken-Taxonomy-Report",
+          year   = "2016 (accessed June 1, 2016)"
+        }</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_kraken_1.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,4 @@
+C	gi|145231|gb|M33724.1|ECOALPHOA	83333	171	83333:162
+C	gi|145232|gb|M33725.1|ECOALPHOB	83333	183	83333:174
+C	gi|145234|gb|M33727.1|ECOALPHOE	562	97	562:88
+C	gi|146195|gb|J01619.1|ECOGLTA	83333	3850	83333:3841
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/input_kraken_2.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,5 @@
+C	gi|145231|gb|M33724.1|ECOALPHOA	83333	171	83333:162
+C	gi|145232|gb|M33725.1|ECOALPHOB	83333	183	83333:174
+C	gi|145234|gb|M33727.1|ECOALPHOE	562	97	562:88
+C	gi|146195|gb|J01619.1|ECOGLTA	83333	3850	83333:3841
+C	gi|145234|gb|M33727.1|ECOALPHOE2	562	97	562:88
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_abundance_1.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,11 @@
+#ID	input_kraken_1.tabular	rank	taxonomy
+root	4	no rank	n__root
+cellular_organisms	4	no rank	n__root;n__cellular_organisms
+Escherichia_coli_K-12	3	no rank	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli;n__Escherichia_coli_K-12
+Bacteria	4	superkingdom	n__root;n__cellular_organisms;d__Bacteria
+Proteobacteria	4	phylum	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria
+Gammaproteobacteria	4	class	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria
+Enterobacteriales	4	order	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales
+Enterobacteriaceae	4	family	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae
+Escherichia	4	genus	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia
+Escherichia_coli	4	species	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_abundance_2.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,11 @@
+#ID	input_kraken_1.tabular	rank	taxonomy
+root	0	no rank	n__root
+cellular_organisms	0	no rank	n__root;n__cellular_organisms
+Escherichia_coli_K-12	3	no rank	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli;n__Escherichia_coli_K-12
+Bacteria	0	superkingdom	n__root;n__cellular_organisms;d__Bacteria
+Proteobacteria	0	phylum	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria
+Gammaproteobacteria	0	class	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria
+Enterobacteriales	0	order	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales
+Enterobacteriaceae	0	family	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae
+Escherichia	0	genus	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia
+Escherichia_coli	1	species	n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_abundance_3.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,2 @@
+#ID	input_kraken_1.tabular	rank	taxonomy
+Escherichia_coli	4	species	d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_abundance_4.tabular	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,2 @@
+#ID	input_kraken_1.tabular	input_kraken_2.tabular	rank	taxonomy
+Escherichia_coli	4	5	species	d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_tree_1.newick	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,1 @@
+(((((((((Escherichia_coli_K-12:1.00000)Escherichia_coli:1.00000)Escherichia:1.00000)Enterobacteriaceae:1.00000)Enterobacteriales:1.00000)Gammaproteobacteria:1.00000)Proteobacteria:1.00000)Bacteria:1.00000)cellular_organisms:1.00000)root:1.00000;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output_tree_3.newick	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,1 @@
+((((((((Escherichia_coli:1.00000)Escherichia:1.00000)Enterobacteriaceae:1.00000)Enterobacteriales:1.00000)Gammaproteobacteria:1.00000)Proteobacteria:1.00000)Bacteria:1.00000)cellular_organisms:1.00000)root:1.00000;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_database.loc	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,1 @@
+test_db	test_db	${__HERE__}
\ No newline at end of file
Binary file test-data/test_db/database.idx has changed
Binary file test-data/test_db/database.kdb has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/names.dmp	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,74 @@
+83333	|	Escherichia coli K-12	|		|	scientific name	|
+83333	|	Escherichia coli K12	|		|	equivalent name	|
+562	|	"Bacillus coli" Migula 1895	|		|	authority	|
+562	|	"Bacterium coli commune" Escherich 1885	|		|	authority	|
+562	|	"Bacterium coli" (Migula 1895) Lehmann and Neumann 1896	|		|	authority	|
+562	|	ATCC 11775	|		|	type material	|
+562	|	Bacillus coli	|		|	synonym	|
+562	|	Bacterium coli	|		|	synonym	|
+562	|	Bacterium coli commune	|		|	synonym	|
+562	|	CCUG 24	|		|	type material	|
+562	|	CCUG 29300	|		|	type material	|
+562	|	CIP 54.8	|		|	type material	|
+562	|	DSM 30083	|		|	type material	|
+562	|	Enterococcus coli	|		|	synonym	|
+562	|	Escherchia coli	|		|	misspelling	|
+562	|	Escherichia coli	|		|	scientific name	|
+562	|	Escherichia coli (Migula 1895) Castellani and Chalmers 1919	|		|	authority	|
+562	|	Escherichia sp. MAR	|		|	includes	|
+562	|	Escherichia/Shigella coli	|		|	equivalent name	|
+562	|	Eschericia coli	|		|	misspelling	|
+562	|	JCM 1649	|		|	type material	|
+562	|	LMG 2092	|		|	type material	|
+562	|	NBRC 102203	|		|	type material	|
+562	|	NCCB 54008	|		|	type material	|
+562	|	NCTC 9001	|		|	type material	|
+562	|	bacterium 10a	|		|	includes	|
+562	|	bacterium E3	|		|	includes	|
+561	|	Escherchia	|		|	misspelling	|
+561	|	Escherichia	|		|	scientific name	|
+561	|	Escherichia Castellani and Chalmers 1919	|		|	authority	|
+543	|	Enterobacteraceae	|		|	synonym	|
+543	|	Enterobacteraceae (ex Lapage 1979) Lapage 1982, fam. nov., nom. rev.	|		|	synonym	|
+543	|	Enterobacteriaceae	|		|	scientific name	|
+543	|	Enterobacteriaceae (ex Rahn 1937) Ewing et al. 1980, fam. nov., nom. rev.	|		|	synonym	|
+543	|	Enterobacteriaceae Rahn 1937	|		|	synonym	|
+543	|	gamma-3 proteobacteria	|	gamma-3 proteobacteria <#1>	|	in-part	|
+91347	|	'Enterobacteriales'	|		|	synonym	|
+91347	|	Enterobacteriaceae and related endosymbionts	|		|	synonym	|
+91347	|	Enterobacteriaceae group	|		|	synonym	|
+91347	|	Enterobacteriales	|		|	scientific name	|
+91347	|	enterobacteria	|	enterobacteria<blast91347>	|	blast name	|
+91347	|	gamma-3 proteobacteria	|	gamma-3 proteobacteria <#5>	|	in-part	|
+1236	|	Gammaproteobacteria	|		|	scientific name	|
+1236	|	Gammaproteobacteria Garrity et al. 2005	|		|	synonym	|
+1236	|	Proteobacteria gamma subdivision	|		|	synonym	|
+1236	|	Purple bacteria, gamma subdivision	|		|	synonym	|
+1236	|	g-proteobacteria	|	gamma proteos<blast1236>	|	blast name	|
+1236	|	gamma proteobacteria	|		|	synonym	|
+1236	|	gamma subdivision	|		|	synonym	|
+1236	|	gamma subgroup	|		|	synonym	|
+1224	|	Proteobacteria	|		|	scientific name	|
+1224	|	Proteobacteria Garrity et al. 2005	|		|	authority	|
+1224	|	Proteobacteria [class] Stackebrandt et al. 1988	|		|	authority	|
+1224	|	not Proteobacteria Cavalier-Smith 2002	|		|	authority	|
+1224	|	proteobacteria	|	proteobacteria<blast1224>	|	blast name	|
+1224	|	purple bacteria	|		|	common name	|
+1224	|	purple bacteria and relatives	|		|	common name	|
+1224	|	purple non-sulfur bacteria	|		|	common name	|
+1224	|	purple photosynthetic bacteria	|		|	common name	|
+1224	|	purple photosynthetic bacteria and relatives	|		|	common name	|
+2	|	Bacteria	|	Bacteria <prokaryote>	|	scientific name	|
+2	|	Monera	|	Monera <Bacteria>	|	in-part	|
+2	|	Procaryotae	|	Procaryotae <Bacteria>	|	in-part	|
+2	|	Prokaryota	|	Prokaryota <Bacteria>	|	in-part	|
+2	|	Prokaryotae	|	Prokaryotae <Bacteria>	|	in-part	|
+2	|	bacteria	|	bacteria <blast2>	|	blast name	|
+2	|	eubacteria	|		|	genbank common name	|
+2	|	not Bacteria Haeckel 1894	|		|	synonym	|
+2	|	prokaryote	|	prokaryote <Bacteria>	|	in-part	|
+2	|	prokaryotes	|	prokaryotes <Bacteria>	|	in-part	|
+1	|	all	|		|	synonym	|
+1	|	root	|		|	scientific name	|
+131567	|	biota	|		|	synonym	|
+131567	|	cellular organisms	|		|	scientific name	|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_db/taxonomy/nodes.dmp	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,10 @@
+83333	|	562	|	no rank	|		|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+562	|	561	|	species	|	EC	|	0	|	1	|	11	|	1	|	0	|	1	|	1	|	0	|		|
+561	|	543	|	genus	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+543	|	91347	|	family	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+91347	|	1236	|	order	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1236	|	1224	|	class	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+1224	|	2	|	phylum	|		|	0	|	1	|	11	|	1	|	0	|	1	|	0	|	0	|		|
+2	|	131567	|	superkingdom	|		|	0	|	0	|	11	|	0	|	0	|	0	|	0	|	0	|		|
+131567	|	1	|	no rank	|		|	8	|	1	|	1	|	1	|	0	|	1	|	1	|	0	|		|
+1	|	1	|	no rank	|		|	8	|	0	|	1	|	0	|	0	|	0	|	0	|	0	|		|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of Kraken database in the required format -->
+    <table name="kraken_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/kraken_databases.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Wed Jun 01 17:25:40 2016 -0400
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of Kraken database in the required format -->
+    <table name="kraken_databases" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/test_database.loc" />
+    </table>
+</tables>