diff kraken_taxonomy_report.py @ 4:27d65c78863c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/kraken_taxonomy_report commit 04943848aa0f6637b56303ec6026dcb475ecb9e5"
author:   iuc
date:     Sun, 20 Mar 2022 16:39:48 +0000
parents:  b11b3ac48bb9
children:
--- a/kraken_taxonomy_report.py	Tue Oct 30 18:58:54 2018 -0400
+++ b/kraken_taxonomy_report.py	Sun Mar 20 16:39:48 2022 +0000
@@ -21,130 +21,132 @@
 # cat nodes.dmp | cut -f 5 | sort | uniq
 # "root" is added manually
 NO_RANK_NAME = "no rank"
-RANK_NAMES = [ NO_RANK_NAME,
-               "root",
-               "superkingdom",
-               "kingdom",
-               "subkingdom",
-               "superphylum",
-               "phylum",
-               "subphylum",
-               "superclass",
-               "class",
-               "subclass",
-               "infraclass",
-               "superorder",
-               "order",
-               "suborder",
-               "infraorder",
-               "parvorder",
-               "superfamily",
-               "family",
-               "subfamily",
-               "tribe",
-               "subtribe",
-               "genus",
-               "subgenus",
-               "species group",
-               "species subgroup",
-               "species",
-               "subspecies",
-               "varietas",
-               "forma" ]
+RANK_NAMES = [
+    NO_RANK_NAME,
+    "root",
+    "superkingdom",
+    "kingdom",
+    "subkingdom",
+    "superphylum",
+    "phylum",
+    "subphylum",
+    "superclass",
+    "class",
+    "subclass",
+    "infraclass",
+    "superorder",
+    "order",
+    "suborder",
+    "infraorder",
+    "parvorder",
+    "superfamily",
+    "family",
+    "subfamily",
+    "tribe",
+    "subtribe",
+    "genus",
+    "subgenus",
+    "species group",
+    "species subgroup",
+    "species",
+    "subspecies",
+    "varietas",
+    "forma"
+]
 # NB: We put 'no rank' at top of list for generating trees, due to e.g.
 # root (root) -> cellular organisms (no rank) -> bacteria (superkingdom)
-RANK_NAME_TO_INTS = dict( [ (y, x) for (x, y) in enumerate( RANK_NAMES ) ] )
-RANK_NAMES_INTS = range( len( RANK_NAMES ) )
+RANK_NAME_TO_INTS = dict([(y, x) for (x, y) in enumerate(RANK_NAMES)])
+RANK_NAMES_INTS = range(len(RANK_NAMES))
 
-NO_RANK_INT = RANK_NAMES.index( NO_RANK_NAME )
+NO_RANK_INT = RANK_NAMES.index(NO_RANK_NAME)
 NO_RANK_CODE = 'n'
 
-PRIMARY_RANK_NAMES = [ 'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom' ]
+PRIMARY_RANK_NAMES = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom']
 RANK_INT_TO_CODE = {}
 for name in PRIMARY_RANK_NAMES:
-    RANK_INT_TO_CODE[ RANK_NAMES.index( name ) ] = name[0]
-RANK_INT_TO_CODE[ RANK_NAMES.index( 'superkingdom' ) ] = 'd'
-PRIMARY_RANK_NAMES.append( 'superkingdom' )
+    RANK_INT_TO_CODE[RANK_NAMES.index(name)] = name[0]
+RANK_INT_TO_CODE[RANK_NAMES.index('superkingdom')] = 'd'
+PRIMARY_RANK_NAMES.append('superkingdom')
 NAME_STUB = "%s__%s"
 NAME_RE = re.compile(r"(\t| |\||\.;)")
 NAME_REPL = "_"
 
 
-def get_kraken_db_path( db ):
-    assert db, ValueError( "You must provide a kraken database" )
-    k_db_path = os.getenv('KRAKEN_DB_PATH', None )
+def get_kraken_db_path(db):
+    assert db, ValueError("You must provide a kraken database")
+    k_db_path = os.getenv('KRAKEN_DB_PATH', None)
     if k_db_path:
-        db = os.path.join( k_db_path, db )
+        db = os.path.join(k_db_path, db)
     return db
 
 
-def load_taxonomy( db_path, sanitize_names=False ):
+def load_taxonomy(db_path, sanitize_names=False):
     child_lists = {}
     name_map = {}
     rank_map = {}
     names = {}  # Store names here to look for duplicates (id, True/False name fixed)
-    with open( os.path.join( db_path, "taxonomy/names.dmp" ) ) as fh:
+    with open(os.path.join(db_path, "taxonomy/names.dmp")) as fh:
         for line in fh:
-            line = line.rstrip( "\n\r" )
-            if line.endswith( "\t|" ):
+            line = line.rstrip("\n\r")
+            if line.endswith("\t|"):
                 line = line[:-2]
-            fields = line.split( "\t|\t" )
+            fields = line.split("\t|\t")
             node_id = fields[0]
             name = fields[1]
             if sanitize_names:
-                name = NAME_RE.sub( NAME_REPL, name )
+                name = NAME_RE.sub(NAME_REPL, name)
             name_type = fields[3]
             if name_type == "scientific name":
                 if name in names:
-                    print( 'Warning: name "%s" found at node "%s" but already exists originally for node "%s".' % ( name, node_id, names[name][0] ), file=sys.stderr )
-                    new_name = "%s_%s" % ( name, node_id )
-                    print( 'Transforming node "%s" named "%s" to "%s".' % ( node_id, name, new_name ), file=sys.stderr )
+                    print('Warning: name "%s" found at node "%s" but already exists originally for node "%s".' % (name, node_id, names[name][0]), file=sys.stderr)
+                    new_name = "%s_%s" % (name, node_id)
+                    print('Transforming node "%s" named "%s" to "%s".' % (node_id, name, new_name), file=sys.stderr)
                     assert new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % new_name
                     if not names[name][1]:
-                        orig_new_name = "%s_%s" % ( name, names[name][0] )
-                        print( 'Transforming node "%s" named "%s" to "%s".' % ( names[name][0], name, orig_new_name ), file=sys.stderr )
+                        orig_new_name = "%s_%s" % (name, names[name][0])
+                        print('Transforming node "%s" named "%s" to "%s".' % (names[name][0], name, orig_new_name), file=sys.stderr)
                         assert orig_new_name not in names, 'Transformed Name "%s" already exists. Cannot recover at this time.' % orig_new_name
                         name_map[names[name][0]] = orig_new_name
-                        names[name] = ( names[name][0], True )
+                        names[name] = (names[name][0], True)
                     name = new_name
                 else:
-                    names[name] = ( node_id, False )
-                name_map[ node_id ] = name
+                    names[name] = (node_id, False)
+                name_map[node_id] = name
 
-    with open( os.path.join( db_path, "taxonomy/nodes.dmp" ) ) as fh:
+    with open(os.path.join(db_path, "taxonomy/nodes.dmp")) as fh:
         for line in fh:
-            line = line.rstrip( "\n\r" )
-            fields = line.split( "\t|\t" )
+            line = line.rstrip("\n\r")
+            fields = line.split("\t|\t")
             node_id = fields[0]
             parent_id = fields[1]
-            rank = RANK_NAME_TO_INTS.get( fields[2].lower(), None )
+            rank = RANK_NAME_TO_INTS.get(fields[2].lower(), None)
             if rank is None:
                 # This should never happen, unless new taxonomy ranks are created
-                print( 'Unrecognized rank: Node "%s" is "%s", setting to "%s"' % ( node_id, fields[2], NO_RANK_NAME ), file=sys.stderr )
+                print('Unrecognized rank: Node "%s" is "%s", setting to "%s"' % (node_id, fields[2], NO_RANK_NAME), file=sys.stderr)
                 rank = NO_RANK_INT
             if node_id == '1':
                 parent_id = '0'
             if parent_id not in child_lists:
-                child_lists[ parent_id ] = []
-            child_lists[ parent_id ].append( node_id )
+                child_lists[parent_id] = []
+            child_lists[parent_id].append(node_id)
             rank_map[node_id] = rank
-    return ( child_lists, name_map, rank_map )
+    return (child_lists, name_map, rank_map)
 
 
-def dfs_summation( node, counts, child_lists ):
-    children = child_lists.get( node, None )
+def dfs_summation(node, counts, child_lists):
+    children = child_lists.get(node, None)
     if children:
         for child in children:
-            dfs_summation( child, counts, child_lists )
-            counts[ node ] = counts.get( node, 0 ) + counts.get( child, 0 )
+            dfs_summation(child, counts, child_lists)
+            counts[node] = counts.get(node, 0) + counts.get(child, 0)
 
 
-def dfs_report( node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None ):
+def dfs_report(node, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None):
     rank_int = rank_map[node]
-    code = RANK_INT_TO_CODE.get( rank_int, NO_RANK_CODE )
-    if ( code != NO_RANK_CODE or options.intermediate ) and ( options.show_zeros or node in hit_taxa):
+    code = RANK_INT_TO_CODE.get(rank_int, NO_RANK_CODE)
+    if (code != NO_RANK_CODE or options.intermediate) and (options.show_zeros or node in hit_taxa):
         if name is None:
             name = ""
         else:
@@ -153,8 +155,8 @@
             tax = ''
         else:
             tax = "%s;" % tax
-        sanitized_name = name_map[ node ]
-        name_stub = NAME_STUB % ( code, sanitized_name )
+        sanitized_name = name_map[node]
+        name_stub = NAME_STUB % (code, sanitized_name)
         name = name + name_stub
         tax = tax + name_stub
         if options.name_id:
@@ -164,80 +166,80 @@
         else:
             output = sanitized_name
         for val in file_data:
-            output = "%s\t%i" % ( output, val.get( node, 0 ) )
+            output = "%s\t%i" % (output, val.get(node, 0))
         if options.show_rank:
-            output = "%s\t%s" % ( output, RANK_NAMES[ rank_int ] )
+            output = "%s\t%s" % (output, RANK_NAMES[rank_int])
         if options.taxonomy:
-            output = "%s\t%s" % ( output, tax )
-        output_lines[ rank_int ].append( output )
-    children = child_lists.get( node )
+            output = "%s\t%s" % (output, tax)
+        output_lines[rank_int].append(output)
+    children = child_lists.get(node)
     if children:
         for child in children:
-            dfs_report( child, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=name, tax=tax )
+            dfs_report(child, file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=name, tax=tax)
 
 
-def write_tree( child_lists, name_map, rank_map, options, branch_length=1 ):
+def write_tree(child_lists, name_map, rank_map, options, branch_length=1):
     # Uses Biopython, only load if making tree
     import Bio.Phylo
     from Bio.Phylo import BaseTree
 
-    def _get_name( node_id ):
+    def _get_name(node_id):
        if options.name_id:
            return node_id
        return name_map[node_id]
     nodes = {}
     root_node_id = child_lists["0"][0]
-    nodes[root_node_id] = BaseTree.Clade( name=_get_name( root_node_id), branch_length=branch_length )
+    nodes[root_node_id] = BaseTree.Clade(name=_get_name(root_node_id), branch_length=branch_length)
 
-    def recurse_children( parent_id ):
+    def recurse_children(parent_id):
         if options.cluster is not None and rank_map[parent_id] == options.cluster:
             # Short circuit if we found our rank, prevents 'hanging' no ranks from being output
             # e.g. clustering by "species" (Escherichia coli), but have "no rank" below (Escherichia coli K-12) in test_db
             return
         if parent_id not in nodes:
-            nodes[parent_id] = BaseTree.Clade( name=_get_name( parent_id ), branch_length=branch_length )
-        for child_id in child_lists.get( parent_id, [] ):
-            if options.cluster is None or ( rank_map[child_id] <= options.cluster ):
+            nodes[parent_id] = BaseTree.Clade(name=_get_name(parent_id), branch_length=branch_length)
+        for child_id in child_lists.get(parent_id, []):
+            if options.cluster is None or (rank_map[child_id] <= options.cluster):
                 if child_id not in nodes:
-                    nodes[child_id] = BaseTree.Clade(name=_get_name( child_id ), branch_length=branch_length)
+                    nodes[child_id] = BaseTree.Clade(name=_get_name(child_id), branch_length=branch_length)
                 nodes[parent_id].clades.append(nodes[child_id])
-                recurse_children( child_id )
-    recurse_children( root_node_id )
+                recurse_children(child_id)
+    recurse_children(root_node_id)
     tree = BaseTree.Tree(root=nodes[root_node_id])
-    Bio.Phylo.write( [tree], options.output_tree, 'newick' )
+    Bio.Phylo.write([tree], options.output_tree, 'newick')
 
 
 def __main__():
-    parser = optparse.OptionParser( usage="%prog [options] file1 file...fileN" )
-    parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='print version and exit' )
-    parser.add_option( '', '--show-zeros', dest='show_zeros', action='store_true', default=False, help='Show empty nodes' )
-    parser.add_option( '', '--header-line', dest='header_line', action='store_true', default=False, help='Provide a header on output' )
-    parser.add_option( '', '--intermediate', dest='intermediate', action='store_true', default=False, help='Intermediate Ranks' )
-    parser.add_option( '', '--name-id', dest='name_id', action='store_true', default=False, help='Use Taxa ID instead of Name' )
-    parser.add_option( '', '--name-long', dest='name_long', action='store_true', default=False, help='Use Long taxa ID instead of base name' )
-    parser.add_option( '', '--taxonomy', dest='taxonomy', action='store_true', default=False, help='Output taxonomy in last column' )
-    parser.add_option( '', '--cluster', dest='cluster', action='store', type="string", default=None, help='Cluster counts to specified rank' )
-    parser.add_option( '', '--summation', dest='summation', action='store_true', default=False, help='Add summation of child counts to each taxa' )
+    parser = optparse.OptionParser(usage="%prog [options] file1 file...fileN")
+    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
+    parser.add_option('', '--show-zeros', dest='show_zeros', action='store_true', default=False, help='Show empty nodes')
+    parser.add_option('', '--header-line', dest='header_line', action='store_true', default=False, help='Provide a header on output')
+    parser.add_option('', '--intermediate', dest='intermediate', action='store_true', default=False, help='Intermediate Ranks')
+    parser.add_option('', '--name-id', dest='name_id', action='store_true', default=False, help='Use Taxa ID instead of Name')
+    parser.add_option('', '--name-long', dest='name_long', action='store_true', default=False, help='Use Long taxa ID instead of base name')
+    parser.add_option('', '--taxonomy', dest='taxonomy', action='store_true', default=False, help='Output taxonomy in last column')
+    parser.add_option('', '--cluster', dest='cluster', action='store', type="string", default=None, help='Cluster counts to specified rank')
+    parser.add_option('', '--summation', dest='summation', action='store_true', default=False, help='Add summation of child counts to each taxa')
     parser.add_option('', '--sanitize-names', dest='sanitize_names', action='store_true', default=False, help=r'Replace special chars (\t| |\||\.;) with underscore (_)')
-    parser.add_option( '', '--show-rank', dest='show_rank', action='store_true', default=False, help='Output column with Rank name' )
-    parser.add_option( '', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database' )
-    parser.add_option( '', '--output', dest='output', action='store', type="string", default=None, help='Name of output file' )
-    parser.add_option( '', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree' )
+    parser.add_option('', '--show-rank', dest='show_rank', action='store_true', default=False, help='Output column with Rank name')
+    parser.add_option('', '--db', dest='db', action='store', type="string", default=None, help='Name of Kraken database')
+    parser.add_option('', '--output', dest='output', action='store', type="string", default=None, help='Name of output file')
+    parser.add_option('', '--output-tree', dest='output_tree', action='store', type="string", default=None, help='Name of output file to place newick tree')
     (options, args) = parser.parse_args()
 
     if options.version:
-        print( "Kraken Taxonomy Report (%s) version %s" % ( __URL__, __VERSION__ ), file=sys.stderr )
+        print("Kraken Taxonomy Report (%s) version %s" % (__URL__, __VERSION__), file=sys.stderr)
         sys.exit()
     if not args:
-        print( parser.get_usage(), file=sys.stderr )
+        print(parser.get_usage(), file=sys.stderr)
         sys.exit()
 
     if options.cluster:
         cluster_name = options.cluster.lower()
-        cluster = RANK_NAME_TO_INTS.get( cluster_name, None )
-        assert cluster is not None, ValueError( '"%s" is not a valid rank for clustering.' % options.cluster )
+        cluster = RANK_NAME_TO_INTS.get(cluster_name, None)
+        assert cluster is not None, ValueError('"%s" is not a valid rank for clustering.' % options.cluster)
         if cluster_name not in PRIMARY_RANK_NAMES:
-            assert options.intermediate, ValueError( 'You cannot cluster by "%s", unless you enable intermediate ranks.' % options.cluster )
-        ranks_to_report = [ cluster ]
+            assert options.intermediate, ValueError('You cannot cluster by "%s", unless you enable intermediate ranks.' % options.cluster)
+        ranks_to_report = [cluster]
         options.cluster = cluster
         # When clustering we need to do summatation
        options.summation = True
@@ -250,43 +252,43 @@
     else:
         output_fh = sys.stdout
 
-    db_path = get_kraken_db_path( options.db )
-    ( child_lists, name_map, rank_map ) = load_taxonomy( db_path, sanitize_names=options.sanitize_names )
+    db_path = get_kraken_db_path(options.db)
+    (child_lists, name_map, rank_map) = load_taxonomy(db_path, sanitize_names=options.sanitize_names)
     file_data = []
     hit_taxa = []
     for input_filename in args:
         taxo_counts = {}
-        with open( input_filename ) as fh:
+        with open(input_filename) as fh:
             for line in fh:
-                fields = line.split( "\t" )
-                taxo_counts[ fields[2] ] = taxo_counts.get( fields[2], 0 ) + 1
+                fields = line.split("\t")
+                taxo_counts[fields[2]] = taxo_counts.get(fields[2], 0) + 1
         clade_counts = taxo_counts.copy()  # fixme remove copying?
         if options.summation:
-            dfs_summation( '1', clade_counts, child_lists )
+            dfs_summation('1', clade_counts, child_lists)
         for key, value in clade_counts.items():
             if value and key not in hit_taxa:
-                hit_taxa.append( key )
-        file_data.append( clade_counts )
+                hit_taxa.append(key)
+        file_data.append(clade_counts)
 
     if options.header_line:
-        output_fh.write( "#ID\t" )
-        output_fh.write( "\t".join( args ) )
+        output_fh.write("#ID\t")
+        output_fh.write("\t".join(args))
         if options.show_rank:
-            output_fh.write( "\trank" )
+            output_fh.write("\trank")
        if options.taxonomy:
-            output_fh.write( "\ttaxonomy" )
-        output_fh.write( '\n' )
+            output_fh.write("\ttaxonomy")
+        output_fh.write('\n')
 
-    output_lines = dict( [ ( x, [] ) for x in RANK_NAMES_INTS ] )
-    dfs_report( '1', file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None )
+    output_lines = dict([(x, []) for x in RANK_NAMES_INTS])
+    dfs_report('1', file_data, hit_taxa, rank_map, name_map, child_lists, output_lines, options, name=None, tax=None)
 
     for rank_int in ranks_to_report:
-        for line in output_lines.get( rank_int, [] ):
-            output_fh.write( line )
-            output_fh.write( '\n' )
+        for line in output_lines.get(rank_int, []):
+            output_fh.write(line)
+            output_fh.write('\n')
 
     fh.close()
     if options.output_tree:
-        write_tree( child_lists, name_map, rank_map, options )
+        write_tree(child_lists, name_map, rank_map, options)
 
 
 if __name__ == "__main__":
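For orientation, the following standalone sketch (not part of the repository) exercises dfs_summation, copied verbatim from the diff above, on a made-up three-level taxonomy. The node IDs and read counts are invented purely for illustration.

# Standalone sketch: dfs_summation as defined in the script above,
# applied to a toy taxonomy. Node IDs and counts are invented.
def dfs_summation(node, counts, child_lists):
    children = child_lists.get(node, None)
    if children:
        for child in children:
            dfs_summation(child, counts, child_lists)
            counts[node] = counts.get(node, 0) + counts.get(child, 0)


child_lists = {'1': ['2'], '2': ['3', '4']}  # root 1 -> 2 -> {3, 4}
clade_counts = {'3': 5, '4': 2}              # reads assigned directly to leaf taxa
dfs_summation('1', clade_counts, child_lists)
print(clade_counts)                          # {'3': 5, '4': 2, '2': 7, '1': 7}

This roll-up is what --summation enables (and what --cluster forces on) before the report is written: per-read taxon assignments counted from column 3 of each Kraken output file become cumulative clade counts for every ancestor node.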