annotate metaphlan_to_phyloxml.py @ 2:1f80b01e1490

changed metaphlan.xml
author nsegata
date Wed, 06 Jun 2012 10:26:24 -0400
parents 016f6375aadc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
1 #!/usr/bin/env python
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
2
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
3 """
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
4 Read MetaPhlAn output summarizing the taxonomic distribution and write it out in PhyloXML format
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
5
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
6 usage: %prog metaphlan.txt phylo.xml
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
7 """
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
8
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
9 import sys
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
10
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
11 # Metaphlan output looks like:
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
12 # k__Bacteria 99.07618
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
13 # k__Archaea 0.92382
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
14 # k__Bacteria|p__Proteobacteria 82.50732
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
15 # k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 81.64905
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
16
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
# Maps the three-character clade prefix used in MetaPhlAn lineage strings
# (e.g. the "k__" in "k__Bacteria") to the rank name written to PhyloXML.
rank_map = {
    'k__': 'kingdom',
    'p__': 'phylum',
    'c__': 'class',
    'o__': 'order',
    'f__': 'family',
    'g__': 'genus',
    's__': 'species',
}


class Node(object):
    """Node in a taxonomy tree.

    The root node has neither rank nor name; every other node is keyed in
    its parent's ``children`` dict by its name.
    """

    def __init__(self, rank=None, name=None):
        self.rank = rank        # rank name taken from rank_map (None on the root)
        self.name = name        # clade name with the "x__" prefix removed
        self.value = None       # abundance as read from the input, if any
        self.children = dict()  # child name -> Node

    @staticmethod
    def from_metaphlan_file(file):
        """
        Build a tree from metaphlan output.

        ``file`` is any iterable of text lines, each holding a
        "|"-separated lineage and an abundance separated by whitespace.
        Blank lines and lines starting with "#" are skipped.  Returns the
        root Node.  Raises ValueError if a data line does not have exactly
        two whitespace-separated fields, and KeyError if a clade prefix is
        not in rank_map.
        """
        root = Node()
        for line in file:
            line = line.strip()
            # Tolerate trailing blank lines and "#" header/comment lines
            # instead of failing on the two-field unpack below.
            if not line or line.startswith("#"):
                continue
            taxa, abundance = line.split()
            root.add(taxa.split("|"), abundance)
        return root

    def add(self, parts, value):
        """
        Parts is a list of prefixed node names ("k__Bacteria", ...).  Walk
        down the tree creating any missing nodes, then attach the value to
        the node reached by the last part.  With an empty list the value is
        attached to this node.  The caller's list is left unmodified.
        Raises KeyError for a prefix that is not in rank_map.
        """
        node = self
        for part in parts:
            rank = rank_map[part[:3]]
            name = part[3:]
            child = node.children.get(name)
            if child is None:
                child = node.children[name] = Node(rank, name)
            node = child
        node.value = value

    def __str__(self):
        """Return a parenthesised, comma-separated rendering of the subtree.

        Named inner nodes render as "(children):name"; a node without a
        name (the root) contributes no label instead of raising TypeError.
        """
        label = self.name or ""
        if not self.children:
            return label
        inner = ",".join(str(child) for child in self.children.values())
        if label:
            return "(" + inner + "):" + label
        return "(" + inner + ")"

    def to_phyloxml(self, out):
        """
        Write this subtree to ``out`` (any object with a ``write`` method)
        as nested PhyloXML <clade> elements, one element per line.
        """
        # Local import: the module header only imports sys.
        from xml.sax.saxutils import escape

        out.write("<clade>\n")
        if self.name:
            # Escape &, < and > so unusual clade names cannot break the XML.
            name = escape(self.name)
            out.write("<name>%s</name>\n" % name)
            out.write("<taxonomy><scientific_name>%s</scientific_name><rank>%s</rank></taxonomy>\n" % (name, self.rank))
        if self.value is not None:
            out.write("<property datatype='xsd:float' ref='metaphlan:abundance' applies_to='node'>%s</property>\n" % escape(str(self.value)))
        for child in self.children.values():
            child.to_phyloxml(out)
        out.write("</clade>\n")
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
67
016f6375aadc Initial commit of metaphlan_to_phyloxml converter.
Dannon Baker <dannonbaker@me.com>
parents:
diff changeset
# Script body: convert the MetaPhlAn table named by sys.argv[1] into a
# PhyloXML document written to sys.argv[2].  Both files are opened in
# `with` blocks so they are flushed and closed even if parsing fails.
with open(sys.argv[2], 'w') as out:
    out.write('<phyloxml xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.phyloxml.org" xsi:schemaLocation="http://www.phyloxml.org http://www.phyloxml.org/1.10/phyloxml.xsd">\n')
    out.write('<phylogeny rooted="true">\n')

    with open(sys.argv[1]) as metaphlan_in:
        Node.from_metaphlan_file(metaphlan_in).to_phyloxml(out)

    out.write('</phylogeny>\n')
    out.write('</phyloxml>\n')