comparison patrist.py @ 0:f8847f5a5491 draft default tip

"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
author nml
date Tue, 17 Dec 2019 09:53:52 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:f8847f5a5491
1 import argparse
2 import os
3 import sys
4
5 from Bio import Phylo
6
7
8 def walk_up(tips, curnode, pathlen, cutoff):
9 """
10 Recursive function for traversing up a tree.
11 """
12 pathlen += curnode.branch_length
13 if pathlen < cutoff:
14 if curnode.is_terminal():
15 tips.append((curnode.name, pathlen))
16 else:
17 for c in curnode.clades:
18 tips = walk_up(tips, c, pathlen, cutoff)
19 return tips
20
21
22 def walk_trunk(curnode, cutoff, parents):
23 """
24 Find all tips in the tree that are within a threshold distance
25 of a reference tip.
26 """
27 # first go down to parent and up other branch
28 tips = []
29 pathlen = curnode.branch_length # 0.0184788
30 p = parents[curnode]
31
32 for c in p.clades:
33 if c == curnode:
34 continue
35 if c.is_terminal():
36 if pathlen + c.branch_length < cutoff:
37 tips.append((c.name, pathlen+c.branch_length))
38 else:
39 tips.extend(walk_up([], c, pathlen, cutoff))
40
41 # next walk down trunk until path length exceeds cutoff or hit root
42 while p in parents:
43 curnode = p
44 pathlen += p.branch_length # + 0.0104047
45 p = parents[curnode]
46 if pathlen >= cutoff:
47 break
48 for c in p.clades:
49 if c == curnode:
50 continue
51 if c.is_terminal():
52 if pathlen + c.branch_length < cutoff: # + 0.0503079
53 tips.append((c.name, pathlen+c.branch_length))
54 else:
55 tips.extend(walk_up([], c, pathlen, cutoff))
56 return tips
57
58
59 def find_short_edges(tree, cutoff, keep_ties=True,
60 minimize=False, returnlist=False):
61 """
62 Find the shortest edge from the earliest sequence of a patient to a
63 any sequence from any other patient.
64 minimize = keep only edge from earliest seq to the closest other seq
65 keep_ties = [to be used in conjunction with minimize]
66 report all edges with the same minimum distance
67 """
68
69 # generate dictionary of child->parent associations
70 parents = {}
71 for clade in tree.find_clades(order='level'):
72 for child in clade:
73 parents.update({child: clade})
74
75 tips = tree.get_terminals()
76 res = {}
77 for tip1 in tips:
78 # find the shortest distance in sequences that "cluster" with this one
79 min_dist = 99999.
80 tip2 = []
81 for tipname, dist in walk_trunk(tip1, cutoff, parents):
82 if minimize and dist < min_dist:
83 min_dist = dist
84 tip2 = [[tipname, dist]]
85 else:
86 tip2.append([tipname, dist])
87
88 t1 = tip1.name
89 for t2, dist in tip2:
90 # sort tip names in lexico order
91 key = (t1, t2) if t1 < t2 else (t2, t1)
92 if key in res:
93 continue
94 res.update({key: dist})
95 if minimize and keep_ties:
96 # output only one edge
97 break
98
99 if returnlist:
100 reslist = []
101 for key, dist in res.iteritems():
102 reslist.append((key[0], key[1], dist))
103 return reslist
104
105 return res
106
107
108 def main():
109 parser = argparse.ArgumentParser(
110 description='Generate clusters of tips from a tree that have'
111 ' a path length within a maximum distance of each other.'
112 )
113 parser.add_argument('tree', help='<input> file '
114 'containing Newick tree string.')
115 parser.add_argument('cutoff', type=float, help='Maximum '
116 'patristic distance.')
117 parser.add_argument('outfile', default=None, help='<output> file to '
118 'write results '
119 'in CSV format.')
120 parser.add_argument('--minimize', help='Report no more than '
121 'one nearest neighbour per tip.',
122 action='store_true')
123 parser.add_argument('--keep_ties', help='If more than one '
124 'tip has the same patristic distance, '
125 'report all as nearest neighbours.',
126 action='store_true')
127 parser.add_argument('--overwrite', help='Overwrite existing '
128 'output file.',
129 action='store_true')
130 args = parser.parse_args()
131
132 assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff)
133
134 if os.path.exists(args.outfile) and not args.overwrite:
135 print ('Output file', args.outfile, 'already exists, use --overwrite.')
136 sys.exit()
137
138 outfile = open(args.outfile, 'w')
139 outfile.write('tree,tip1,tip2,dist,is.tie\n')
140
141 trees = Phylo.parse(args.tree, 'newick')
142 for treenum, tree in enumerate(trees):
143 results = find_short_edges(tree, args.cutoff)
144 for key, dist in results.items():
145 outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist))
146
147 outfile.close()
148
149
150 if __name__ == "__main__":
151 main()