comparison corebio/seq_io/_nexus/Trees.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents c55bdc2fb9fa
children
comparison
equal deleted inserted replaced
3:09d2dac9ef73 4:4d47ab2b7bcc
1
2 #
3 # Trees.py
4 #
5 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved.
6 # This code is part of the Biopython distribution and governed by its
7 # license. Please see the LICENSE file that should have been included
8 # as part of this package.
9 #
10 # Tree class handles phylogenetic trees. Provides a set of methods to read and write newick-format tree
11 # descriptions, get information about trees (monphyly of taxon sets, congruence between trees, common ancestors,...)
12 # and to manipulate trees (reroot trees, split terminal nodes).
13 #
14 # Bug reports welcome: fkauff@duke.edu
15 #
16
17 import sys, random, sets
18 import Nodes
19
20 PRECISION_BRANCHLENGTH=6
21 PRECISION_SUPPORT=6
22
23 class TreeError(Exception): pass
24
25 class NodeData:
26 """Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
27 def __init__(self,taxon=None,branchlength=0.0,support=None):
28 self.taxon=taxon
29 self.branchlength=branchlength
30 self.support=support
31
32 class Tree(Nodes.Chain):
33 """Represents a tree using a chain of nodes with on predecessor (=ancestor)
34 and multiple successors (=subclades).
35 """
36 # A newick tree is parsed into nested list and then converted to a node list in two stages
37 # mostly due to historical reasons. This could be done in one swoop). Note: parentheses ( ) and
38 # colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much
39 # easier when parsing trees.
40
41 ## NOTE: Tree should store its data class in something like self.dataclass=data,
42 ## so that nodes that are generated have easy access to the data class
43 ## Some routines use automatically NodeData, this needs to be more concise
44
45 def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0):
46 """Ntree(self,tree)."""
47 Nodes.Chain.__init__(self)
48 self.dataclass=data
49 self.__values_are_support=values_are_support
50 self.max_support=max_support
51 self.weight=weight
52 self.rooted=rooted
53 self.name=name
54 root=Nodes.Node(data())
55 self.add(root)
56 self.root=root.id
57 if tree: # use the tree we have
58 # if Tree is called from outside Nexus parser, we need to get rid of linebreaks, etc
59 tree=tree.strip().replace('\n','').replace('\r','')
60 # there's discrepancy whether newick allows semicolons et the end
61 tree=tree.rstrip(';')
62 self._add_subtree(parent_id=root.id,tree=self._parse(tree)[0])
63
64 def _parse(self,tree):
65 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
66
67 if tree.count('(')!=tree.count(')'):
68 raise TreeError, 'Parentheses do not match in (sub)tree: '+tree
69 if tree.count('(')==0: # a leaf
70 colon=tree.rfind(':')
71 if colon>-1:
72 return [tree[:colon],self._get_values(tree[colon+1:])]
73 else:
74 return [tree,[None]]
75 else:
76 closing=tree.rfind(')')
77 val=self._get_values(tree[closing+1:])
78 if not val:
79 val=[None]
80 subtrees=[]
81 plevel=0
82 prev=1
83 for p in range(1,closing):
84 if tree[p]=='(':
85 plevel+=1
86 elif tree[p]==')':
87 plevel-=1
88 elif tree[p]==',' and plevel==0:
89 subtrees.append(tree[prev:p])
90 prev=p+1
91 subtrees.append(tree[prev:closing])
92 subclades=[self._parse(subtree) for subtree in subtrees]
93 return [subclades,val]
94
95 def _add_subtree(self,parent_id=None,tree=None):
96 """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree)."""
97
98 if parent_id is None:
99 raise TreeError('Need node_id to connect to.')
100 for st in tree:
101 if type(st[0])==list: # it's a subtree
102 nd=self.dataclass()
103 if len(st[1])>=2: # if there's two values, support comes first. Is that always so?
104 nd.support=st[1][0]
105 if st[1][1] is not None:
106 nd.branchlength=st[1][1]
107 elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths
108 if not self.__values_are_support: # default
109 if st[1][0] is not None:
110 nd.branchlength=st[1][0]
111 else:
112 nd.support=st[1][0]
113 sn=Nodes.Node(nd)
114 self.add(sn,parent_id)
115 self._add_subtree(sn.id,st[0])
116 else: # it's a leaf
117 nd=self.dataclass()
118 nd.taxon=st[0]
119 if len(st)>1:
120 if len(st[1])>=2: # if there's two values, support comes first. Is that always so?
121 nd.support=st[1][0]
122 if st[1][1] is not None:
123 nd.branchlength=st[1][1]
124 elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths
125 if not self.__values_are_support: # default
126 if st[1][0] is not None:
127 nd.branchlength=st[1][0]
128 else:
129 nd.support=st[1][0]
130 leaf=Nodes.Node(nd)
131 self.add(leaf,parent_id)
132
133 def _get_values(self, text):
134 """Extracts values (support/branchlength) from xx[:yyy], xx."""
135
136 if text=='':
137 return None
138 return [float(t) for t in text.split(':') if t.strip()]
139
140 def _walk(self,node=None):
141 """Return all node_ids downwards from a node."""
142
143 if node is None:
144 node=self.root
145 for n in self.node(node).succ:
146 yield n
147 for sn in self._walk(n):
148 yield sn
149
150 def node(self,node_id):
151 """Return the instance of node_id.
152
153 node = node(self,node_id)
154 """
155 if node_id not in self.chain:
156 raise TreeError('Unknown node_id: %d' % node_id)
157 return self.chain[node_id]
158
159 def split(self,parent_id=None,n=2,branchlength=1.0):
160 """Speciation: generates n (default two) descendants of a node.
161
162 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
163 """
164 if parent_id is None:
165 raise TreeError('Missing node_id.')
166 ids=[]
167 parent_data=self.chain[parent_id].data
168 for i in range(n):
169 node=Nodes.Node()
170 if parent_data:
171 node.data=self.dataclass()
172 # each node has taxon and branchlength attribute
173 if parent_data.taxon:
174 node.data.taxon=parent_data.taxon+str(i)
175 node.data.branchlength=branchlength
176 ids.append(self.add(node,parent_id))
177 return ids
178
179 def search_taxon(self,taxon):
180 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
181
182 node_id = search_taxon(self,taxon)
183 """
184 for id,node in self.chain.items():
185 if node.data.taxon==taxon:
186 return id
187 return None
188
189 def prune(self,taxon):
190 """Prunes a terminal taxon from the tree.
191
192 id_of_previous_node = prune(self,taxon)
193 If taxon is from a bifurcation, the connectiong node will be collapsed
194 and its branchlength added to remaining terminal node. This might be no
195 longer a meaningful value'
196 """
197
198 id=self.search_taxon(taxon)
199 if id is None:
200 raise TreeError('Taxon not found: %s' % taxon)
201 elif id not in self.get_terminals():
202 raise TreeError('Not a terminal taxon: %s' % taxon)
203 else:
204 prev=self.unlink(id)
205 self.kill(id)
206 if not prev==self.root and len(self.node(prev).succ)==1:
207 succ=self.node(prev).succ[0]
208 new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
209 self.collapse(prev)
210 self.node(succ).data.branchlength=new_bl
211 return prev
212
213 def get_taxa(self,node_id=None):
214 """Return a list of all otus downwards from a node (self, node_id).
215
216 nodes = get_taxa(self,node_id=None)
217 """
218
219 if node_id is None:
220 node_id=self.root
221 if node_id not in self.chain:
222 raise TreeError('Unknown node_id: %d.' % node_id)
223 if self.chain[node_id].succ==[]:
224 if self.chain[node_id].data:
225 return [self.chain[node_id].data.taxon]
226 else:
227 return None
228 else:
229 list=[]
230 for succ in self.chain[node_id].succ:
231 list.extend(self.get_taxa(succ))
232 return list
233
234 def get_terminals(self):
235 """Return a list of all terminal nodes."""
236 return [i for i in self.all_ids() if self.node(i).succ==[]]
237
238 def sum_branchlength(self,root=None,node=None):
239 """Adds up the branchlengths from root (default self.root) to node.
240
241 sum = sum_branchlength(self,root=None,node=None)
242 """
243
244 if root is None:
245 root=self.root
246 if node is None:
247 raise TreeError('Missing node id.')
248 blen=0.0
249 while node is not None and node is not root:
250 blen+=self.node(node).data.branchlength
251 node=self.node(node).prev
252 return blen
253
254 def set_subtree(self,node):
255 """Return subtree as a set of nested sets.
256
257 sets = set_subtree(self,node)
258 """
259
260 if self.node(node).succ==[]:
261 return self.node(node).data.taxon
262 else:
263 return sets.Set([self.set_subtree(n) for n in self.node(node).succ])
264
265 def is_identical(self,tree2):
266 """Compare tree and tree2 for identity.
267
268 result = is_identical(self,tree2)
269 """
270 return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
271
272 def is_compatible(self,tree2,threshold,strict=True):
273 """Compares branches with support>threshold for compatibility.
274
275 result = is_compatible(self,tree2,threshold)
276 """
277
278 # check if both trees have the same set of taxa. strict=True enforces this.
279 missing2=sets.Set(self.get_taxa())-sets.Set(tree2.get_taxa())
280 missing1=sets.Set(tree2.get_taxa())-sets.Set(self.get_taxa())
281 if strict and (missing1 or missing2):
282 if missing1:
283 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name)
284 if missing2:
285 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name)
286 raise TreeError, 'Can\'t compare trees with different taxon compositions.'
287 t1=[(sets.Set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if \
288 self.node(n).succ and\
289 (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
290 t2=[(sets.Set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if \
291 tree2.node(n).succ and\
292 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
293 conflict=[]
294 for (st1,sup1) in t1:
295 for (st2,sup2) in t2:
296 if not st1.issubset(st2) and not st2.issubset(st1): # don't hiccup on upstream nodes
297 intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2 # all three are non-empty sets
298 # if notin1==missing1 or notin2==missing2 <==> st1.issubset(st2) or st2.issubset(st1) ???
299 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)): # omit conflicts due to missing taxa
300 conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
301 return conflict
302
303 def common_ancestor(self,node1,node2):
304 """Return the common ancestor that connects to nodes.
305
306 node_id = common_ancestor(self,node1,node2)
307 """
308
309 l1=[self.root]+self.trace(self.root,node1)
310 l2=[self.root]+self.trace(self.root,node2)
311 return [n for n in l1 if n in l2][-1]
312
313
314 def distance(self,node1,node2):
315 """Add and return the sum of the branchlengths between two nodes.
316 dist = distance(self,node1,node2)
317 """
318
319 ca=self.common_ancestor(node1,node2)
320 return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
321
322 def is_monophyletic(self,taxon_list):
323 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
324
325 result = is_monophyletic(self,taxon_list)
326 """
327 if isinstance(taxon_list,str):
328 taxon_set=sets.Set([taxon_list])
329 else:
330 taxon_set=sets.Set(taxon_list)
331 node_id=self.root
332 while 1:
333 subclade_taxa=sets.Set(self.get_taxa(node_id))
334 if subclade_taxa==taxon_set: # are we there?
335 return node_id
336 else: # check subnodes
337 for subnode in self.chain[node_id].succ:
338 if sets.Set(self.get_taxa(subnode)).issuperset(taxon_set): # taxon_set is downstream
339 node_id=subnode
340 break # out of for loop
341 else:
342 return -1 # taxon set was not with successors, for loop exhausted
343
344 def is_bifurcating(self,node=None):
345 """Return True if tree downstream of node is strictly bifurcating."""
346 if not node:
347 node=self.root
348 if node==self.root and len(self.node(node).succ)==3: #root can be trifurcating, because it has no ancestor
349 return self.is_bifurcating(self.node(node).succ[0]) and \
350 self.is_bifurcating(self.node(node).succ[1]) and \
351 self.is_bifurcating(self.node(node).succ[2])
352 if len(self.node(node).succ)==2:
353 return self.is_bifurcating(self.node(node).succ[0]) and self.is_bifurcating(self.node(node).succ[1])
354 elif len(self.node(node).succ)==0:
355 return True
356 else:
357 return False
358
359
360
361 def branchlength2support(self):
362 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0
363
364 This is necessary when support has been stored as branchlength (e.g. paup), and has thus
365 been read in as branchlength.
366 """
367
368 for n in self.chain.keys():
369 self.node(n).data.support=self.node(n).data.branchlength
370 self.node(n).data.branchlength=0.0
371
372 def convert_absolute_support(self,nrep):
373 """Convert absolute support (clade-count) to rel. frequencies.
374
375 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of
376 calculating relative frequencies."""
377
378 for n in self._walk():
379 if self.node(n).data.support:
380 self.node(n).data.support/=float(nrep)
381
382 def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
383 """Generates a random tree with ntax taxa and/or taxa from taxlabels.
384
385 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
386 Trees are bifurcating by default. (Polytomies not yet supported).
387 """
388
389 if not ntax and taxon_list:
390 ntax=len(taxon_list)
391 elif not taxon_list and ntax:
392 taxon_list=['taxon'+str(i+1) for i in range(ntax)]
393 elif not ntax and not taxon_list:
394 raise TreeError('Either numer of taxa or list of taxa must be specified.')
395 elif ntax<>len(taxon_list):
396 raise TreeError('Length of taxon list must correspond to ntax.')
397 # initiate self with empty root
398 self.__init__()
399 terminals=self.get_terminals()
400 # bifurcate randomly at terminal nodes until ntax is reached
401 while len(terminals)<ntax:
402 newsplit=random.choice(terminals)
403 new_terminals=self.split(parent_id=newsplit,branchlength=branchlength)
404 # if desired, give some variation to the branch length
405 if branchlength_sd:
406 for nt in new_terminals:
407 bl=random.gauss(branchlength,branchlength_sd)
408 if bl<0:
409 bl=0
410 self.node(nt).data.branchlength=bl
411 terminals.extend(new_terminals)
412 terminals.remove(newsplit)
413 # distribute taxon labels randomly
414 random.shuffle(taxon_list)
415 for (node,name) in zip(terminals,taxon_list):
416 self.node(node).data.taxon=name
417
418 def display(self):
419 """Quick and dirty lists of all nodes."""
420 table=[('#','taxon','prev','succ','brlen','blen (sum)','support')]
421 for i in self.all_ids():
422 n=self.node(i)
423 if not n.data:
424 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-'))
425 else:
426 tx=n.data.taxon
427 if not tx:
428 tx='-'
429 blength=n.data.branchlength
430 if blength is None:
431 blength='-'
432 sum_blength='-'
433 else:
434 sum_blength=self.sum_branchlength(node=i)
435 support=n.data.support
436 if support is None:
437 support='-'
438 table.append((str(i),tx,str(n.prev),str(n.succ),blength,sum_blength,support))
439 print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s' % l for l in table])
440 print '\nRoot: ',self.root
441
442 def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False):
443 """Return a paup compatible tree line.
444
445 to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True)
446 """
447 # if there's a conflict in the arguments, we override plain=True
448 if support_as_branchlengths or branchlengths_only:
449 plain=False
450 self.support_as_branchlengths=support_as_branchlengths
451 self.branchlengths_only=branchlengths_only
452 self.plain=plain
453
454 def make_info_string(data,terminal=False):
455 """Creates nicely formatted support/branchlengths."""
456 # CHECK FORMATTING
457 if self.plain: # plain tree only. That's easy.
458 return ''
459 elif self.support_as_branchlengths: # support as branchlengths (eg. PAUP), ignore actual branchlengths
460 if terminal: # terminal branches have 100% support
461 return ':%1.2f' % self.max_support
462 else:
463 return ':%1.2f' % (data.support)
464 elif self.branchlengths_only: # write only branchlengths, ignore support
465 return ':%1.5f' % (data.branchlength)
466 else: # write suport and branchlengths (e.g. .con tree of mrbayes)
467 if terminal:
468 return ':%1.5f' % (data.branchlength)
469 else:
470 if data.branchlength is not None and data.support is not None: # we have blen and suppport
471 return '%1.2f:%1.5f' % (data.support,data.branchlength)
472 elif data.branchlength is not None: # we have only blen
473 return '0.00000:%1.5f' % (data.branchlength)
474 elif data.support is not None: # we have only support
475 return '%1.2f:0.00000' % (data.support)
476 else:
477 return '0.00:0.00000'
478
479 def newickize(node):
480 """Convert a node tree to a newick tree recursively."""
481
482 if not self.node(node).succ: #terminal
483 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True)
484 else:
485 return '(%s)%s' % (','.join(map(newickize,self.node(node).succ)),make_info_string(self.node(node).data))
486 return subtree
487
488 treeline=['tree']
489 if self.name:
490 treeline.append(self.name)
491 else:
492 treeline.append('a_tree')
493 treeline.append('=')
494 if self.weight<>1:
495 treeline.append('[&W%s]' % str(round(float(self.weight),3)))
496 if self.rooted:
497 treeline.append('[&R]')
498 treeline.append('(%s)' % ','.join(map(newickize,self.node(self.root).succ)))
499 treeline.append(';')
500 if plain_newick:
501 return treeline[-2]
502 else:
503 return ' '.join(treeline)
504
505 def __str__(self):
506 """Short version of to_string(), gives plain tree"""
507 return self.to_string(plain=True)
508
509 def unroot(self):
510 """Defines a unrooted Tree structure, using data of a rooted Tree."""
511
512 # travel down the rooted tree structure and save all branches and the nodes they connect
513
514 def _get_branches(node):
515 branches=[]
516 for b in self.node(node).succ:
517 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support])
518 branches.extend(_get_branches(b))
519 return branches
520
521 self.unrooted=_get_branches(self.root)
522 # if root is bifurcating, then it is eliminated
523 if len(self.node(self.root).succ)==2:
524 # find the two branches that connect to root
525 rootbranches=[b for b in self.unrooted if self.root in b[:2]]
526 b1=self.unrooted.pop(self.unrooted.index(rootbranches[0]))
527 b2=self.unrooted.pop(self.unrooted.index(rootbranches[1]))
528 # Connect them two each other. If both have support, it should be identical (or one set to None?).
529 # If both have branchlengths, they will be added
530 newbranch=[b1[1],b2[1],b1[2]+b2[2]]
531 if b1[3] is None:
532 newbranch.append(b2[3]) # either None (both rootbranches are unsupported) or some support
533 elif b2[3] is None:
534 newbranch.append(b1[3]) # dito
535 elif b1[3]==b2[3]:
536 newbranch.append(b1[3]) # identical support
537 elif b1[3]==0 or b2[3]==0:
538 newbranch.append(b1[3]+b2[3]) # one is 0, take the other
539 else:
540 raise TreeError, 'Support mismatch in bifurcating root: %f, %f' % (float(b1[3]),float(b2[3]))
541 self.unrooted.append(newbranch)
542
543 def root_with_outgroup(self,outgroup=None):
544
545 def _connect_subtree(parent,child):
546 """Hook subtree starting with node child to parent."""
547 for i,branch in enumerate(self.unrooted):
548 if parent in branch[:2] and child in branch[:2]:
549 branch=self.unrooted.pop(i)
550 break
551 else:
552 raise TreeError, 'Unable to connect nodes for rooting: nodes %d and %d are not connected' % (parent,child)
553 self.link(parent,child)
554 self.node(child).data.branchlength=branch[2]
555 self.node(child).data.support=branch[3]
556 #now check if there are more branches connected to the child, and if so, connect them
557 child_branches=[b for b in self.unrooted if child in b[:2]]
558 for b in child_branches:
559 if child==b[0]:
560 succ=b[1]
561 else:
562 succ=b[0]
563 _connect_subtree(child,succ)
564
565 # check the outgroup we're supposed to root with
566 if outgroup is None:
567 return self.root
568 outgroup_node=self.is_monophyletic(outgroup)
569 if outgroup_node==-1:
570 return -1
571 # if tree is already rooted with outgroup on a bifurcating root,
572 # or the outgroup includes all taxa on the tree, then we're fine
573 if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root:
574 return self.root
575
576 self.unroot()
577 # now we find the branch that connects outgroup and ingroup
578 #print self.node(outgroup_node).prev
579 for i,b in enumerate(self.unrooted):
580 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]:
581 root_branch=self.unrooted.pop(i)
582 break
583 else:
584 raise TreeError, 'Unrooted and rooted Tree do not match'
585 if outgroup_node==root_branch[1]:
586 ingroup_node=root_branch[0]
587 else:
588 ingroup_node=root_branch[1]
589 # now we destroy the old tree structure, but keep node data. Nodes will be reconnected according to new outgroup
590 for n in self.all_ids():
591 self.node(n).prev=None
592 self.node(n).succ=[]
593 # now we just add both subtrees (outgroup and ingroup) branch for branch
594 root=Nodes.Node(data=NodeData()) # new root
595 self.add(root) # add to tree description
596 self.root=root.id # set as root
597 self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]]) # add branch to ingroup to unrooted tree
598 self.unrooted.append([root.id,outgroup_node,0.0,0.0]) # add branch to outgroup to unrooted tree
599 _connect_subtree(root.id,ingroup_node) # add ingroup
600 _connect_subtree(root.id,outgroup_node) # add outgroup
601 # if theres still a lonely node in self.chain, then it's the old root, and we delete it
602 oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root]
603 if len(oldroot)>1:
604 raise TreeError, 'Isolated nodes in tree description: %s' % ','.join(oldroot)
605 elif len(oldroot)==1:
606 self.kill(oldroot[0])
607 return self.root
608
609
610 def consensus(trees, threshold=0.5,outgroup=None):
611 """Compute a majority rule consensus tree of all clades with relative frequency>=threshold from a list of trees."""
612
613 total=len(trees)
614 if total==0:
615 return None
616 # shouldn't we make sure that it's NodeData or subclass??
617 dataclass=trees[0].dataclass
618 max_support=trees[0].max_support
619 clades={}
620 #countclades={}
621 alltaxa=sets.Set(trees[0].get_taxa())
622 # calculate calde frequencies
623 c=0
624 for t in trees:
625 c+=1
626 #if c%50==0:
627 # print c
628 if alltaxa!=sets.Set(t.get_taxa()):
629 raise TreeError, 'Trees for consensus must contain the same taxa'
630 t.root_with_outgroup(outgroup=outgroup)
631 for st_node in t._walk(t.root):
632 subclade_taxa=t.get_taxa(st_node)
633 subclade_taxa.sort()
634 subclade_taxa=str(subclade_taxa) # lists are not hashable
635 if subclade_taxa in clades:
636 clades[subclade_taxa]+=float(t.weight)/total
637 else:
638 clades[subclade_taxa]=float(t.weight)/total
639 #if subclade_taxa in countclades:
640 # countclades[subclade_taxa]+=t.weight
641 #else:
642 # countclades[subclade_taxa]=t.weight
643 # weed out clades below threshold
644 for (c,p) in clades.items():
645 if p<threshold:
646 del clades[c]
647 # create a tree with a root node
648 consensus=Tree(name='consensus_%2.1f' % float(threshold),data=dataclass)
649 # each clade needs a node in the new tree, add them as isolated nodes
650 for (c,s) in clades.items():
651 node=Nodes.Node(data=dataclass())
652 node.data.support=s
653 node.data.taxon=sets.Set(eval(c))
654 consensus.add(node)
655 # set root node data
656 consensus.node(consensus.root).data.support=None
657 consensus.node(consensus.root).data.taxon=alltaxa
658 # we sort the nodes by no. of taxa in the clade, so root will be the last
659 consensus_ids=consensus.all_ids()
660 consensus_ids.sort(lambda x,y:len(consensus.node(x).data.taxon)-len(consensus.node(y).data.taxon))
661 # now we just have to hook each node to the next smallest node that includes all taxa of the current
662 for i,current in enumerate(consensus_ids[:-1]): # skip the last one which is the root
663 #print '----'
664 #print 'current: ',consensus.node(current).data.taxon
665 # search remaining nodes
666 for parent in consensus_ids[i+1:]:
667 #print 'parent: ',consensus.node(parent).data.taxon
668 if consensus.node(parent).data.taxon.issuperset(consensus.node(current).data.taxon):
669 break
670 else:
671 sys.exit('corrupt tree structure?')
672 # internal nodes don't have taxa
673 if len(consensus.node(current).data.taxon)==1:
674 consensus.node(current).data.taxon=consensus.node(current).data.taxon.pop()
675 # reset the support for terminal nodes to maximum
676 #consensus.node(current).data.support=max_support
677 else:
678 consensus.node(current).data.taxon=None
679 consensus.link(parent,current)
680 # eliminate root taxon name
681 consensus.node(consensus_ids[-1]).data.taxon=None
682 return consensus
683
684
685
686