Mercurial > repos > iuc > kraken_taxonomy_report
changeset 0:3f1a0d47ea8d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/kraken_taxonomy_report/ commit 1c0a7aff7c5f6578a11e6e8e9bface8d02e7f8a1
author | iuc |
---|---|
date | Wed, 01 Jun 2016 17:25:40 -0400 |
parents | |
children | b97694b21bc3 |
files | kraken_databases.loc.sample kraken_taxonomy_report.py kraken_taxonomy_report.xml test-data/input_kraken_1.tabular test-data/input_kraken_2.tabular test-data/output_abundance_1.tabular test-data/output_abundance_2.tabular test-data/output_abundance_3.tabular test-data/output_abundance_4.tabular test-data/output_tree_1.newick test-data/output_tree_3.newick test-data/test_database.loc test-data/test_db/database.idx test-data/test_db/database.kdb test-data/test_db/taxonomy/names.dmp test-data/test_db/taxonomy/nodes.dmp tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 17 files changed, 658 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# File: kraken_taxonomy_report.py

# Reports a summary of Kraken's results
# and optionally creates a newick Tree
# Copyright (c) 2016 Daniel Blankenberg
# Licensed under the Academic Free License version 3.0
# https://github.com/blankenberg/Kraken-Taxonomy-Report

import sys
import os
import optparse
import re

__VERSION__ = '0.0.1'

__URL__ = "https://github.com/blankenberg/Kraken-Taxonomy-Report"

# Rank names were pulled from ncbi nodes.dmp on 02/02/2016
# cat nodes.dmp | cut -f 5 | sort | uniq
# "root" is added manually
NO_RANK_NAME = "no rank"
RANK_NAMES = [
    NO_RANK_NAME,
    "root",
    "superkingdom",
    "kingdom",
    "subkingdom",
    "superphylum",
    "phylum",
    "subphylum",
    "superclass",
    "class",
    "subclass",
    "infraclass",
    "superorder",
    "order",
    "suborder",
    "infraorder",
    "parvorder",
    "superfamily",
    "family",
    "subfamily",
    "tribe",
    "subtribe",
    "genus",
    "subgenus",
    "species group",
    "species subgroup",
    "species",
    "subspecies",
    "varietas",
    "forma",
]
# NB: We put 'no rank' at top of list for generating trees, due to e.g.
# root (root) -> cellular organisms (no rank) -> bacteria (superkingdom)

# Rank name -> position in RANK_NAMES; the position doubles as the rank's integer id.
RANK_NAME_TO_INTS = dict((rank_name, rank_int) for (rank_int, rank_name) in enumerate(RANK_NAMES))
RANK_NAMES_INTS = list(range(len(RANK_NAMES)))

NO_RANK_INT = RANK_NAMES.index(NO_RANK_NAME)
NO_RANK_CODE = 'n'

# Primary ranks get a one-letter code (first letter of the rank name);
# superkingdom is special-cased to 'd'. Every other rank falls back to NO_RANK_CODE.
PRIMARY_RANK_NAMES = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
RANK_INT_TO_CODE = dict((RANK_NAMES.index(rank_name), rank_name[0]) for rank_name in PRIMARY_RANK_NAMES)
RANK_INT_TO_CODE[RANK_NAMES.index('superkingdom')] = 'd'
PRIMARY_RANK_NAMES.append('superkingdom')

NAME_STUB = "%s__%s"
# Raw string: same pattern as before, without invalid "\|" / "\." string escapes.
NAME_RE = re.compile(r"(\t| |\||\.;)")
NAME_REPL = "_"


def _warn(message):
    """Write one line to stderr (works on both Python 2 and Python 3)."""
    sys.stderr.write("%s\n" % message)


def get_kraken_db_path(db):
    """Return the path of Kraken database *db*.

    When the KRAKEN_DB_PATH environment variable is set, *db* is joined onto
    it; otherwise *db* is returned unchanged.

    Raises:
        ValueError: if *db* is empty or None.
    """
    # Explicit raise instead of `assert`, which is stripped under `python -O`
    # and raised AssertionError rather than the intended ValueError.
    if not db:
        raise ValueError("You must provide a kraken database")
    k_db_path = os.getenv('KRAKEN_DB_PATH', None)
    if k_db_path:
        db = os.path.join(k_db_path, db)
    return db


def load_taxonomy(db_path, sanitize_names=False):
    """Load taxonomy/names.dmp and taxonomy/nodes.dmp found under *db_path*.

    Args:
        db_path: directory containing the "taxonomy" sub-directory.
        sanitize_names: when true, substitute NAME_REPL for every NAME_RE match
            in each name.

    Returns:
        A tuple ``(child_lists, name_map, rank_map)`` where
        ``child_lists`` maps a parent node id to the list of its child ids
        (the node with id '1' is filed under the artificial parent '0'),
        ``name_map`` maps a node id to its "scientific name", and
        ``rank_map`` maps a node id to its rank integer (index in RANK_NAMES).
    """
    child_lists = {}
    name_map = {}
    rank_map = {}
    with open(os.path.join(db_path, "taxonomy/names.dmp")) as fh:
        for line in fh:
            line = line.rstrip("\n\r")
            if not line:
                # tolerate blank lines instead of failing with IndexError
                continue
            if line.endswith("\t|"):
                line = line[:-2]
            fields = line.split("\t|\t")
            node_id = fields[0]
            name = fields[1]
            if sanitize_names:
                name = NAME_RE.sub(NAME_REPL, name)
            name_type = fields[3]
            if name_type == "scientific name":
                name_map[node_id] = name

    with open(os.path.join(db_path, "taxonomy/nodes.dmp")) as fh:
        for line in fh:
            line = line.rstrip("\n\r")
            if not line:
                continue
            fields = line.split("\t|\t")
            node_id = fields[0]
            parent_id = fields[1]
            rank = RANK_NAME_TO_INTS.get(fields[2].lower(), None)
            if rank is None:
                # This should never happen, unless new taxonomy ranks are created
                _warn('Unrecognized rank: Node "%s" is "%s", setting to "%s"' % (node_id, fields[2], NO_RANK_NAME))
                rank = NO_RANK_INT
            if node_id == '1':
                # the root lists itself as parent; re-parent it to avoid a cycle
                parent_id = '0'
            child_lists.setdefault(parent_id, []).append(node_id)
            rank_map[node_id] = rank
    return (child_lists, name_map, rank_map)


def dfs_summation(node, counts, child_lists):
    """Add the counts of all descendants of *node* into ``counts[node]``.

    *counts* is modified in place: after the call every visited node that has
    children holds its own count plus the totals of its children.
    """
    children = child_lists.get(node, None)
    if children:
        for child in children:
            dfs_summation(child, counts, child_lists)
            counts[node] = counts.get(node, 0) + counts.get(child, 0)


def dfs_report(node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None):
    """Walk the taxonomy depth-first from *node* and collect report lines.

    A node is reported when it has a primary rank code (or
    ``options.intermediate`` is set) and it is in *hit_taxa* (or
    ``options.show_zeros`` is set). Each line is appended to
    ``output_lines[rank_int]``.

    Args:
        node: id of the node to start from.
        file_data: one ``{node_id: count}`` dict per input file.
        hit_taxa: container of node ids that have a non-zero count.
        rank_map, name_map, child_lists: as returned by load_taxonomy().
        output_lines: ``{rank_int: [line, ...]}``, modified in place.
        options: object providing the attributes show_zeros, intermediate,
            name_id, name_long, show_rank and taxonomy.
        name: "|"-joined long name accumulated from reported ancestors.
        tax: ";"-joined taxonomy string accumulated from reported ancestors.
    """
    # NOTE: the traversal is deliberately NOT cut short at nodes lacking a hit.
    # Without summation only directly-hit taxa are in hit_taxa, so stopping at
    # an unhit ancestor (e.g. the root) would hide every hit below it.
    rank_int = rank_map[node]
    code = RANK_INT_TO_CODE.get(rank_int, NO_RANK_CODE)
    if (code != NO_RANK_CODE or options.intermediate) and (options.show_zeros or node in hit_taxa):
        if name is None:
            name = ""
        else:
            name = "%s|" % name
        if tax is None:
            tax = ''
        else:
            tax = "%s;" % tax
        sanitized_name = name_map[node]
        name_stub = NAME_STUB % (code, sanitized_name)
        name = name + name_stub
        tax = tax + name_stub
        if options.name_id:
            output = node
        elif options.name_long:
            output = name
        else:
            output = sanitized_name
        for val in file_data:
            output = "%s\t%i" % (output, val.get(node, 0))
        if options.show_rank:
            output = "%s\t%s" % (output, RANK_NAMES[rank_int])
        if options.taxonomy:
            output = "%s\t%s" % (output, tax)
        output_lines[rank_int].append(output)
    for child in child_lists.get(node) or ():
        dfs_report(child, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=name, tax=tax)


def write_tree(child_lists, name_map, rank_map, options, branch_length=1):
    """Write the taxonomy as a newick tree to ``options.output_tree``.

    When ``options.cluster`` is a rank integer, only children whose rank
    integer is <= that value are added, and recursion stops at nodes of exactly
    that rank. Nodes are labelled with their id when ``options.name_id`` is
    set, otherwise with their name from *name_map*.
    """
    # Uses Biopython, only load if making tree
    import Bio.Phylo
    from Bio.Phylo import BaseTree

    def _get_name(node_id):
        if options.name_id:
            return node_id
        return name_map[node_id]

    def _new_clade(node_id):
        return BaseTree.Clade(name=_get_name(node_id), branch_length=branch_length)

    nodes = {}
    root_node_id = child_lists["0"][0]
    nodes[root_node_id] = _new_clade(root_node_id)

    def recurse_children(parent_id):
        if options.cluster is not None and rank_map[parent_id] == options.cluster:
            # Short circuit if we found our rank, prevents 'hanging' no ranks from being output
            # e.g. clustering by "species" (Escherichia coli), but have "no rank" below (Escherichia coli K-12) in test_db
            return
        if parent_id not in nodes:
            nodes[parent_id] = _new_clade(parent_id)
        for child_id in child_lists.get(parent_id, []):
            if options.cluster is None or (rank_map[child_id] <= options.cluster):
                if child_id not in nodes:
                    nodes[child_id] = _new_clade(child_id)
                nodes[parent_id].clades.append(nodes[child_id])
                recurse_children(child_id)

    recurse_children(root_node_id)
    tree = BaseTree.Tree(root=nodes[root_node_id])
    Bio.Phylo.write([tree], options.output_tree, 'newick')


def _build_option_parser():
    """Create the command line parser (option names and defaults are unchanged)."""
    parser = optparse.OptionParser(usage="%prog [options] file1 file...fileN")
    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
    parser.add_option('--show-zeros', dest='show_zeros', action='store_true', default=False, help='Show empty nodes')
    parser.add_option('--header-line', dest='header_line', action='store_true', default=False, help='Provide a header on output')
    parser.add_option('--intermediate', dest='intermediate', action='store_true', default=False, help='Intermediate Ranks')
    parser.add_option('--name-id', dest='name_id', action='store_true', default=False, help='Use Taxa ID instead of Name')
    parser.add_option('--name-long', dest='name_long', action='store_true', default=False, help='Use Long taxa ID instead of base name')
    parser.add_option('--taxonomy', dest='taxonomy', action='store_true', default=False, help='Output taxonomy in last column')
    parser.add_option('--cluster', dest='cluster', action='store', type="string", default=None, help='Cluster counts to specified rank')
    parser.add_option('--summation', dest='summation', action='store_true', default=False, help='Add summation of child counts to each taxa')
    # Raw string so the help shows a literal "\t" rather than an actual tab character.
    parser.add_option('--sanitize-names', dest='sanitize_names', action='store_true', default=False, help=r'Replace special chars (\t| |\||\.;) with underscore (_)')
    parser.add_option('--show-rank', dest='show_rank', action='store_true', default=False, help='Output column with Rank name')
    parser.add_option('--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database')
    parser.add_option('--output', dest='output', action='store', type="string", default=None, help='Name of output file')
    parser.add_option('--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree')
    return parser


def _count_taxa(input_filename):
    """Return ``{taxon_id: number_of_lines}`` using the third tab-separated column of the file."""
    taxo_counts = {}
    with open(input_filename) as fh:
        for line in fh:
            fields = line.rstrip("\n\r").split("\t")
            if len(fields) < 3:
                # blank or truncated line: nothing to count
                continue
            taxon = fields[2]
            taxo_counts[taxon] = taxo_counts.get(taxon, 0) + 1
    return taxo_counts


def __main__():
    """Command line entry point: parse options, load the taxonomy, write the report and optional tree."""
    parser = _build_option_parser()
    (options, args) = parser.parse_args()
    if options.version:
        _warn("Kraken Taxonomy Report (%s) version %s" % (__URL__, __VERSION__))
        sys.exit()
    if not args:
        _warn(parser.get_usage())
        sys.exit()

    if options.cluster:
        cluster_name = options.cluster.lower()
        cluster = RANK_NAME_TO_INTS.get(cluster_name, None)
        if cluster is None:
            raise ValueError('"%s" is not a valid rank for clustering.' % options.cluster)
        if cluster_name not in PRIMARY_RANK_NAMES and not options.intermediate:
            raise ValueError('You cannot cluster by "%s", unless you enable intermediate ranks.' % options.cluster)
        ranks_to_report = [cluster]
        options.cluster = cluster
        # When clustering we need to do summation
        options.summation = True
    else:
        options.cluster = None  # make empty string into None
        ranks_to_report = RANK_NAMES_INTS

    db_path = get_kraken_db_path(options.db)
    (child_lists, name_map, rank_map) = load_taxonomy(db_path, sanitize_names=options.sanitize_names)
    file_data = []
    hit_taxa = set()  # set: O(1) membership tests while walking the whole taxonomy
    for input_filename in args:
        clade_counts = _count_taxa(input_filename)
        if options.summation:
            dfs_summation('1', clade_counts, child_lists)
        hit_taxa.update(key for (key, value) in clade_counts.items() if value)
        file_data.append(clade_counts)

    output_lines = dict((rank_int, []) for rank_int in RANK_NAMES_INTS)
    dfs_report('1', file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None)

    # Text mode ('w'): only str is written. The file is opened once the report
    # is ready and is always closed (stdout is left open).
    if options.output:
        output_fh = open(options.output, 'w')
    else:
        output_fh = sys.stdout
    try:
        if options.header_line:
            output_fh.write("#ID\t")
            output_fh.write("\t".join(args))
            if options.show_rank:
                output_fh.write("\trank")
            if options.taxonomy:
                output_fh.write("\ttaxonomy")
            output_fh.write('\n')
        for rank_int in ranks_to_report:
            for line in output_lines.get(rank_int, []):
                output_fh.write(line)
                output_fh.write('\n')
    finally:
        if output_fh is not sys.stdout:
            output_fh.close()

    if options.output_tree:
        write_tree(child_lists, name_map, rank_map, options)


if __name__ == "__main__":
    __main__()
<?xml version="1.0"?>
<!-- File: kraken_taxonomy_report.xml : Galaxy wrapper around kraken_taxonomy_report.py -->
<tool id="kraken_taxonomy_report" name="Kraken taxonomic report" version="0.0.1">
    <description>view report of classification for multiple samples</description>
    <requirements>
        <!-- Biopython is only imported by the script when a newick tree is written -->
        <requirement type="package" version="1.66">biopython</requirement>
    </requirements>
    <stdio>
        <exit_code range="1:" />
        <exit_code range=":-1" />
    </stdio>
    <version_command>python ${__tool_directory__}/kraken_taxonomy_report.py --version</version_command>
    <command>
<![CDATA[

#for $input_classification in $classification:
    ln -s "${input_classification}" "${input_classification.element_identifier}" &&
#end for

export KRAKEN_DB_PATH="${kraken_database.fields.path}" &&
python ${__tool_directory__}/kraken_taxonomy_report.py

--db "${kraken_database.fields.name}"
${show_zeros}
${header_line}
${otu_name}
${taxonomy}
${show_rank}
${intermediate}
${sanitize_names}
#if str( $cluster.cluster ):
    --cluster "${cluster.cluster}"
#else:
    ${cluster.summation}
#end if

--output "${output_report}"

#if $output_tree:
    --output-tree "${output_tree}"
#end if

#for $input_classification in $classification:
    "${input_classification.element_identifier}"
#end for

]]>
    </command>
    <inputs>
        <param format="tabular" label="Kraken output" multiple="True" name="classification" type="data" />
        <param checked="False" falsevalue="" argument="--show-zeros" label="Display taxa even if they lack a read in any sample" name="show_zeros" truevalue="--show-zeros" type="boolean" />
        <param checked="True" falsevalue="" argument="--header-line" label="Display a header line indicating sample IDs" name="header_line" truevalue="--header-line" type="boolean" />
        <param label="Select a Kraken database" name="kraken_database" type="select" help="Select the same database used to classify reads">
            <options from_data_table="kraken_databases">
                <validator message="No Kraken databases are available" type="no_options" />
            </options>
        </param>
        <param label="How to name OTUs" name="otu_name" type="select" multiple="False">
            <option value="" selected="True">Node name only</option>
            <option value="--name-long">Taxonified Name</option>
            <option value="--name-id">Node ID</option>
        </param>
        <param checked="True" falsevalue="" argument="--sanitize-names" label="Sanitize Names" name="sanitize_names" truevalue="--sanitize-names" type="boolean" help="Replace special chars (\t| |\||\.;) with underscore (_)" />
        <param checked="False" falsevalue="" argument="--show-rank" label="Output Rank Name in (second to) last column" name="show_rank" truevalue="--show-rank" type="boolean" />
        <param checked="False" falsevalue="" argument="--taxonomy" label="Output taxonomy in last column" name="taxonomy" truevalue="--taxonomy" type="boolean" />
        <param checked="False" falsevalue="" argument="--intermediate" label="Display intermediate ranks" name="intermediate" truevalue="--intermediate" type="boolean" />
        <!-- The summation option is only offered when no clustering rank is selected;
             the command template passes either the cluster rank or the summation flag -->
        <conditional name="cluster">
            <param argument="--cluster" help="Combines rows under the selected taxon and reports only selected rank." label="Cluster by taxonomic rank" name="cluster" type="select">
                <option value="Superkingdom">Superkingdom</option>
                <option value="Kingdom">Kingdom</option>
                <option value="Subkingdom">Subkingdom</option>
                <option value="Superphylum">Superphylum</option>
                <option value="Phylum">Phylum</option>
                <option value="Subphylum">Subphylum</option>
                <option value="Superclass">Superclass</option>
                <option value="Class">Class</option>
                <option value="Subclass">Subclass</option>
                <option value="Infraclass">Infraclass</option>
                <option value="Superorder">Superorder</option>
                <option value="Order">Order</option>
                <option value="Suborder">Suborder</option>
                <option value="Infraorder">Infraorder</option>
                <option value="Parvorder">Parvorder</option>
                <option value="Superfamily">Superfamily</option>
                <option value="Family">Family</option>
                <option value="Subfamily">Subfamily</option>
                <option value="Tribe">Tribe</option>
                <option value="Subtribe">Subtribe</option>
                <option value="Genus">Genus</option>
                <option value="Subgenus">Subgenus</option>
                <option value="Species Group">Species Group</option>
                <option value="Species Subgroup">Species Subgroup</option>
                <option value="Species">Species</option>
                <option value="Subspecies">Subspecies</option>
                <option value="Varietas">Varietas</option>
                <option value="Forma">Forma</option>
                <option value="" selected="True">No Clustering</option>
            </param>
            <when value="">
                <param checked="False" falsevalue="" argument="--summation" label="Summation of lower ranks into higher ranks" name="summation" truevalue="--summation" type="boolean" />
            </when>
            <when value="Superkingdom"/>
            <when value="Kingdom"/>
            <when value="Subkingdom"/>
            <when value="Superphylum"/>
            <when value="Phylum"/>
            <when value="Subphylum"/>
            <when value="Superclass"/>
            <when value="Class"/>
            <when value="Subclass"/>
            <when value="Infraclass"/>
            <when value="Superorder"/>
            <when value="Order"/>
            <when value="Suborder"/>
            <when value="Infraorder"/>
            <when value="Parvorder"/>
            <when value="Superfamily"/>
            <when value="Family"/>
            <when value="Subfamily"/>
            <when value="Tribe"/>
            <when value="Subtribe"/>
            <when value="Genus"/>
            <when value="Subgenus"/>
            <when value="Species Group"/>
            <when value="Species Subgroup"/>
            <when value="Species"/>
            <when value="Subspecies"/>
            <when value="Varietas"/>
            <when value="Forma"/>
        </conditional>
        <param checked="False" falsevalue="" label="Output a newick tree" name="tree" truevalue="true" type="boolean" help="Trees are pruned at specified rank when clustering"/>
    </inputs>
    <outputs>
        <data format="tabular" name="output_report" label="${tool.name} on ${on_string} (Abundances)"/>
        <!-- The tree output is only created when the "tree" checkbox is ticked -->
        <data format="txt" name="output_tree" label="${tool.name} on ${on_string} (Newick Tree)">
            <filter>tree</filter>
        </data>
    </outputs>
    <tests>
        <!-- 1: no clustering, with summation -->
        <test>
            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
            <param name="show_zeros" value="True"/>
            <param name="header_line" value="True"/>
            <param name="kraken_database" value="test_db"/>
            <param name="otu_name" value=""/>
            <param name="sanitize_names" value="True"/>
            <param name="show_rank" value="True"/>
            <param name="taxonomy" value="True"/>
            <param name="intermediate" value="True"/>
            <conditional name="cluster">
                <param name="cluster" value=""/>
                <param name="summation" value="True"/>
            </conditional>
            <param name="tree" value="True"/>
            <output name="output_report" file="output_abundance_1.tabular" ftype="tabular"/>
            <output name="output_tree" file="output_tree_1.newick" />
        </test>
        <!-- 2: no clustering, without summation -->
        <test>
            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
            <param name="show_zeros" value="True"/>
            <param name="header_line" value="True"/>
            <param name="kraken_database" value="test_db"/>
            <param name="otu_name" value=""/>
            <param name="sanitize_names" value="True"/>
            <param name="show_rank" value="True"/>
            <param name="taxonomy" value="True"/>
            <param name="intermediate" value="True"/>
            <conditional name="cluster">
                <param name="cluster" value=""/>
                <param name="summation" value="False"/>
            </conditional>
            <param name="tree" value="True"/>
            <output name="output_report" file="output_abundance_2.tabular" ftype="tabular"/>
            <output name="output_tree" file="output_tree_1.newick" />
        </test>
        <!-- 3: cluster by species, single sample -->
        <test>
            <param name="classification" value="input_kraken_1.tabular" ftype="tabular"/>
            <param name="show_zeros" value="True"/>
            <param name="header_line" value="True"/>
            <param name="kraken_database" value="test_db"/>
            <param name="otu_name" value=""/>
            <param name="sanitize_names" value="True"/>
            <param name="show_rank" value="True"/>
            <param name="taxonomy" value="True"/>
            <param name="intermediate" value="False"/>
            <conditional name="cluster">
                <param name="cluster" value="Species"/>
            </conditional>
            <param name="tree" value="True"/>
            <output name="output_report" file="output_abundance_3.tabular" ftype="tabular"/>
            <output name="output_tree" file="output_tree_3.newick" />
        </test>
        <!-- 4: cluster by species, two samples -->
        <test>
            <param name="classification" value="input_kraken_1.tabular,input_kraken_2.tabular" ftype="tabular"/>
            <param name="show_zeros" value="True"/>
            <param name="header_line" value="True"/>
            <param name="kraken_database" value="test_db"/>
            <param name="otu_name" value=""/>
            <param name="sanitize_names" value="True"/>
            <param name="show_rank" value="True"/>
            <param name="taxonomy" value="True"/>
            <param name="intermediate" value="False"/>
            <conditional name="cluster">
                <param name="cluster" value="Species"/>
            </conditional>
            <param name="tree" value="True"/>
            <output name="output_report" file="output_abundance_4.tabular" ftype="tabular"/>
            <output name="output_tree" file="output_tree_3.newick" />
        </test>
    </tests>
    <help>
<![CDATA[

.. class:: warningmark

**Note**: the database used must be the same as the one used in the original Kraken run

-----

**What it Does**

Summarizes read counts across taxonomic ranks for multiple samples. This is convenient for comparing results across multiple experiments, conditions, locations, etc.

-----

**Output**

The output is tab-delimited, with one line per taxon.

Will optionally output a newick tree built from the kraken database taxonomy using the specified options. Tree branch lengths will be set to "1.00000".


]]>
    </help>
    <citations>
        <citation type="bibtex">@unpublished{Kraken-Taxonomy-Report:2016,
            title = "Kraken Taxonomy Report",
            author = "Daniel Blankenberg",
            url = "https://github.com/blankenberg/Kraken-Taxonomy-Report",
            year = "2016 (accessed June 1, 2016)"
        }</citation>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_kraken_1.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,4 @@ +C gi|145231|gb|M33724.1|ECOALPHOA 83333 171 83333:162 +C gi|145232|gb|M33725.1|ECOALPHOB 83333 183 83333:174 +C gi|145234|gb|M33727.1|ECOALPHOE 562 97 562:88 +C gi|146195|gb|J01619.1|ECOGLTA 83333 3850 83333:3841
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/input_kraken_2.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,5 @@ +C gi|145231|gb|M33724.1|ECOALPHOA 83333 171 83333:162 +C gi|145232|gb|M33725.1|ECOALPHOB 83333 183 83333:174 +C gi|145234|gb|M33727.1|ECOALPHOE 562 97 562:88 +C gi|146195|gb|J01619.1|ECOGLTA 83333 3850 83333:3841 +C gi|145234|gb|M33727.1|ECOALPHOE2 562 97 562:88
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_abundance_1.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,11 @@ +#ID input_kraken_1.tabular rank taxonomy +root 4 no rank n__root +cellular_organisms 4 no rank n__root;n__cellular_organisms +Escherichia_coli_K-12 3 no rank n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli;n__Escherichia_coli_K-12 +Bacteria 4 superkingdom n__root;n__cellular_organisms;d__Bacteria +Proteobacteria 4 phylum n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria +Gammaproteobacteria 4 class n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria +Enterobacteriales 4 order n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales +Enterobacteriaceae 4 family n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae +Escherichia 4 genus n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia +Escherichia_coli 4 species n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_abundance_2.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,11 @@ +#ID input_kraken_1.tabular rank taxonomy +root 0 no rank n__root +cellular_organisms 0 no rank n__root;n__cellular_organisms +Escherichia_coli_K-12 3 no rank n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli;n__Escherichia_coli_K-12 +Bacteria 0 superkingdom n__root;n__cellular_organisms;d__Bacteria +Proteobacteria 0 phylum n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria +Gammaproteobacteria 0 class n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria +Enterobacteriales 0 order n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales +Enterobacteriaceae 0 family n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae +Escherichia 0 genus n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia +Escherichia_coli 1 species n__root;n__cellular_organisms;d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_abundance_3.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,2 @@ +#ID input_kraken_1.tabular rank taxonomy +Escherichia_coli 4 species d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_abundance_4.tabular Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,2 @@ +#ID input_kraken_1.tabular input_kraken_2.tabular rank taxonomy +Escherichia_coli 4 5 species d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia_coli
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_tree_1.newick Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,1 @@ +(((((((((Escherichia_coli_K-12:1.00000)Escherichia_coli:1.00000)Escherichia:1.00000)Enterobacteriaceae:1.00000)Enterobacteriales:1.00000)Gammaproteobacteria:1.00000)Proteobacteria:1.00000)Bacteria:1.00000)cellular_organisms:1.00000)root:1.00000;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output_tree_3.newick Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,1 @@ +((((((((Escherichia_coli:1.00000)Escherichia:1.00000)Enterobacteriaceae:1.00000)Enterobacteriales:1.00000)Gammaproteobacteria:1.00000)Proteobacteria:1.00000)Bacteria:1.00000)cellular_organisms:1.00000)root:1.00000;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_database.loc Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,1 @@ +test_db test_db ${__HERE__} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/names.dmp Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,74 @@ +83333 | Escherichia coli K-12 | | scientific name | +83333 | Escherichia coli K12 | | equivalent name | +562 | "Bacillus coli" Migula 1895 | | authority | +562 | "Bacterium coli commune" Escherich 1885 | | authority | +562 | "Bacterium coli" (Migula 1895) Lehmann and Neumann 1896 | | authority | +562 | ATCC 11775 | | type material | +562 | Bacillus coli | | synonym | +562 | Bacterium coli | | synonym | +562 | Bacterium coli commune | | synonym | +562 | CCUG 24 | | type material | +562 | CCUG 29300 | | type material | +562 | CIP 54.8 | | type material | +562 | DSM 30083 | | type material | +562 | Enterococcus coli | | synonym | +562 | Escherchia coli | | misspelling | +562 | Escherichia coli | | scientific name | +562 | Escherichia coli (Migula 1895) Castellani and Chalmers 1919 | | authority | +562 | Escherichia sp. MAR | | includes | +562 | Escherichia/Shigella coli | | equivalent name | +562 | Eschericia coli | | misspelling | +562 | JCM 1649 | | type material | +562 | LMG 2092 | | type material | +562 | NBRC 102203 | | type material | +562 | NCCB 54008 | | type material | +562 | NCTC 9001 | | type material | +562 | bacterium 10a | | includes | +562 | bacterium E3 | | includes | +561 | Escherchia | | misspelling | +561 | Escherichia | | scientific name | +561 | Escherichia Castellani and Chalmers 1919 | | authority | +543 | Enterobacteraceae | | synonym | +543 | Enterobacteraceae (ex Lapage 1979) Lapage 1982, fam. nov., nom. rev. | | synonym | +543 | Enterobacteriaceae | | scientific name | +543 | Enterobacteriaceae (ex Rahn 1937) Ewing et al. 1980, fam. nov., nom. rev. 
| | synonym | +543 | Enterobacteriaceae Rahn 1937 | | synonym | +543 | gamma-3 proteobacteria | gamma-3 proteobacteria <#1> | in-part | +91347 | 'Enterobacteriales' | | synonym | +91347 | Enterobacteriaceae and related endosymbionts | | synonym | +91347 | Enterobacteriaceae group | | synonym | +91347 | Enterobacteriales | | scientific name | +91347 | enterobacteria | enterobacteria<blast91347> | blast name | +91347 | gamma-3 proteobacteria | gamma-3 proteobacteria <#5> | in-part | +1236 | Gammaproteobacteria | | scientific name | +1236 | Gammaproteobacteria Garrity et al. 2005 | | synonym | +1236 | Proteobacteria gamma subdivision | | synonym | +1236 | Purple bacteria, gamma subdivision | | synonym | +1236 | g-proteobacteria | gamma proteos<blast1236> | blast name | +1236 | gamma proteobacteria | | synonym | +1236 | gamma subdivision | | synonym | +1236 | gamma subgroup | | synonym | +1224 | Proteobacteria | | scientific name | +1224 | Proteobacteria Garrity et al. 2005 | | authority | +1224 | Proteobacteria [class] Stackebrandt et al. 
1988 | | authority | +1224 | not Proteobacteria Cavalier-Smith 2002 | | authority | +1224 | proteobacteria | proteobacteria<blast1224> | blast name | +1224 | purple bacteria | | common name | +1224 | purple bacteria and relatives | | common name | +1224 | purple non-sulfur bacteria | | common name | +1224 | purple photosynthetic bacteria | | common name | +1224 | purple photosynthetic bacteria and relatives | | common name | +2 | Bacteria | Bacteria <prokaryote> | scientific name | +2 | Monera | Monera <Bacteria> | in-part | +2 | Procaryotae | Procaryotae <Bacteria> | in-part | +2 | Prokaryota | Prokaryota <Bacteria> | in-part | +2 | Prokaryotae | Prokaryotae <Bacteria> | in-part | +2 | bacteria | bacteria <blast2> | blast name | +2 | eubacteria | | genbank common name | +2 | not Bacteria Haeckel 1894 | | synonym | +2 | prokaryote | prokaryote <Bacteria> | in-part | +2 | prokaryotes | prokaryotes <Bacteria> | in-part | +1 | all | | synonym | +1 | root | | scientific name | +131567 | biota | | synonym | +131567 | cellular organisms | | scientific name |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_db/taxonomy/nodes.dmp Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,10 @@ +83333 | 562 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +562 | 561 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | | +2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | | +131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | | +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of Kraken database in the required format --> + <table name="kraken_databases" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/kraken_databases.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed Jun 01 17:25:40 2016 -0400 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of Kraken database in the required format --> + <table name="kraken_databases" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/test_database.loc" /> + </table> +</tables>