Mercurial > repos > nml > patrist
comparison patrist.py @ 0:f8847f5a5491 draft default tip
"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
author | nml |
---|---|
date | Tue, 17 Dec 2019 09:53:52 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f8847f5a5491 |
---|---|
1 import argparse | |
2 import os | |
3 import sys | |
4 | |
5 from Bio import Phylo | |
6 | |
7 | |
8 def walk_up(tips, curnode, pathlen, cutoff): | |
9 """ | |
10 Recursive function for traversing up a tree. | |
11 """ | |
12 pathlen += curnode.branch_length | |
13 if pathlen < cutoff: | |
14 if curnode.is_terminal(): | |
15 tips.append((curnode.name, pathlen)) | |
16 else: | |
17 for c in curnode.clades: | |
18 tips = walk_up(tips, c, pathlen, cutoff) | |
19 return tips | |
20 | |
21 | |
22 def walk_trunk(curnode, cutoff, parents): | |
23 """ | |
24 Find all tips in the tree that are within a threshold distance | |
25 of a reference tip. | |
26 """ | |
27 # first go down to parent and up other branch | |
28 tips = [] | |
29 pathlen = curnode.branch_length # 0.0184788 | |
30 p = parents[curnode] | |
31 | |
32 for c in p.clades: | |
33 if c == curnode: | |
34 continue | |
35 if c.is_terminal(): | |
36 if pathlen + c.branch_length < cutoff: | |
37 tips.append((c.name, pathlen+c.branch_length)) | |
38 else: | |
39 tips.extend(walk_up([], c, pathlen, cutoff)) | |
40 | |
41 # next walk down trunk until path length exceeds cutoff or hit root | |
42 while p in parents: | |
43 curnode = p | |
44 pathlen += p.branch_length # + 0.0104047 | |
45 p = parents[curnode] | |
46 if pathlen >= cutoff: | |
47 break | |
48 for c in p.clades: | |
49 if c == curnode: | |
50 continue | |
51 if c.is_terminal(): | |
52 if pathlen + c.branch_length < cutoff: # + 0.0503079 | |
53 tips.append((c.name, pathlen+c.branch_length)) | |
54 else: | |
55 tips.extend(walk_up([], c, pathlen, cutoff)) | |
56 return tips | |
57 | |
58 | |
59 def find_short_edges(tree, cutoff, keep_ties=True, | |
60 minimize=False, returnlist=False): | |
61 """ | |
62 Find the shortest edge from the earliest sequence of a patient to a | |
63 any sequence from any other patient. | |
64 minimize = keep only edge from earliest seq to the closest other seq | |
65 keep_ties = [to be used in conjunction with minimize] | |
66 report all edges with the same minimum distance | |
67 """ | |
68 | |
69 # generate dictionary of child->parent associations | |
70 parents = {} | |
71 for clade in tree.find_clades(order='level'): | |
72 for child in clade: | |
73 parents.update({child: clade}) | |
74 | |
75 tips = tree.get_terminals() | |
76 res = {} | |
77 for tip1 in tips: | |
78 # find the shortest distance in sequences that "cluster" with this one | |
79 min_dist = 99999. | |
80 tip2 = [] | |
81 for tipname, dist in walk_trunk(tip1, cutoff, parents): | |
82 if minimize and dist < min_dist: | |
83 min_dist = dist | |
84 tip2 = [[tipname, dist]] | |
85 else: | |
86 tip2.append([tipname, dist]) | |
87 | |
88 t1 = tip1.name | |
89 for t2, dist in tip2: | |
90 # sort tip names in lexico order | |
91 key = (t1, t2) if t1 < t2 else (t2, t1) | |
92 if key in res: | |
93 continue | |
94 res.update({key: dist}) | |
95 if minimize and keep_ties: | |
96 # output only one edge | |
97 break | |
98 | |
99 if returnlist: | |
100 reslist = [] | |
101 for key, dist in res.iteritems(): | |
102 reslist.append((key[0], key[1], dist)) | |
103 return reslist | |
104 | |
105 return res | |
106 | |
107 | |
108 def main(): | |
109 parser = argparse.ArgumentParser( | |
110 description='Generate clusters of tips from a tree that have' | |
111 ' a path length within a maximum distance of each other.' | |
112 ) | |
113 parser.add_argument('tree', help='<input> file ' | |
114 'containing Newick tree string.') | |
115 parser.add_argument('cutoff', type=float, help='Maximum ' | |
116 'patristic distance.') | |
117 parser.add_argument('outfile', default=None, help='<output> file to ' | |
118 'write results ' | |
119 'in CSV format.') | |
120 parser.add_argument('--minimize', help='Report no more than ' | |
121 'one nearest neighbour per tip.', | |
122 action='store_true') | |
123 parser.add_argument('--keep_ties', help='If more than one ' | |
124 'tip has the same patristic distance, ' | |
125 'report all as nearest neighbours.', | |
126 action='store_true') | |
127 parser.add_argument('--overwrite', help='Overwrite existing ' | |
128 'output file.', | |
129 action='store_true') | |
130 args = parser.parse_args() | |
131 | |
132 assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff) | |
133 | |
134 if os.path.exists(args.outfile) and not args.overwrite: | |
135 print ('Output file', args.outfile, 'already exists, use --overwrite.') | |
136 sys.exit() | |
137 | |
138 outfile = open(args.outfile, 'w') | |
139 outfile.write('tree,tip1,tip2,dist,is.tie\n') | |
140 | |
141 trees = Phylo.parse(args.tree, 'newick') | |
142 for treenum, tree in enumerate(trees): | |
143 results = find_short_edges(tree, args.cutoff) | |
144 for key, dist in results.items(): | |
145 outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist)) | |
146 | |
147 outfile.close() | |
148 | |
149 | |
150 if __name__ == "__main__": | |
151 main() |