Mercurial > repos > nml > patrist
comparison patrist.py @ 0:f8847f5a5491 draft default tip
"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
| author | nml |
|---|---|
| date | Tue, 17 Dec 2019 09:53:52 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:f8847f5a5491 |
|---|---|
| 1 import argparse | |
| 2 import os | |
| 3 import sys | |
| 4 | |
| 5 from Bio import Phylo | |
| 6 | |
| 7 | |
| 8 def walk_up(tips, curnode, pathlen, cutoff): | |
| 9 """ | |
| 10 Recursive function for traversing up a tree. | |
| 11 """ | |
| 12 pathlen += curnode.branch_length | |
| 13 if pathlen < cutoff: | |
| 14 if curnode.is_terminal(): | |
| 15 tips.append((curnode.name, pathlen)) | |
| 16 else: | |
| 17 for c in curnode.clades: | |
| 18 tips = walk_up(tips, c, pathlen, cutoff) | |
| 19 return tips | |
| 20 | |
| 21 | |
| 22 def walk_trunk(curnode, cutoff, parents): | |
| 23 """ | |
| 24 Find all tips in the tree that are within a threshold distance | |
| 25 of a reference tip. | |
| 26 """ | |
| 27 # first go down to parent and up other branch | |
| 28 tips = [] | |
| 29 pathlen = curnode.branch_length # 0.0184788 | |
| 30 p = parents[curnode] | |
| 31 | |
| 32 for c in p.clades: | |
| 33 if c == curnode: | |
| 34 continue | |
| 35 if c.is_terminal(): | |
| 36 if pathlen + c.branch_length < cutoff: | |
| 37 tips.append((c.name, pathlen+c.branch_length)) | |
| 38 else: | |
| 39 tips.extend(walk_up([], c, pathlen, cutoff)) | |
| 40 | |
| 41 # next walk down trunk until path length exceeds cutoff or hit root | |
| 42 while p in parents: | |
| 43 curnode = p | |
| 44 pathlen += p.branch_length # + 0.0104047 | |
| 45 p = parents[curnode] | |
| 46 if pathlen >= cutoff: | |
| 47 break | |
| 48 for c in p.clades: | |
| 49 if c == curnode: | |
| 50 continue | |
| 51 if c.is_terminal(): | |
| 52 if pathlen + c.branch_length < cutoff: # + 0.0503079 | |
| 53 tips.append((c.name, pathlen+c.branch_length)) | |
| 54 else: | |
| 55 tips.extend(walk_up([], c, pathlen, cutoff)) | |
| 56 return tips | |
| 57 | |
| 58 | |
| 59 def find_short_edges(tree, cutoff, keep_ties=True, | |
| 60 minimize=False, returnlist=False): | |
| 61 """ | |
| 62 Find the shortest edge from the earliest sequence of a patient to a | |
| 63 any sequence from any other patient. | |
| 64 minimize = keep only edge from earliest seq to the closest other seq | |
| 65 keep_ties = [to be used in conjunction with minimize] | |
| 66 report all edges with the same minimum distance | |
| 67 """ | |
| 68 | |
| 69 # generate dictionary of child->parent associations | |
| 70 parents = {} | |
| 71 for clade in tree.find_clades(order='level'): | |
| 72 for child in clade: | |
| 73 parents.update({child: clade}) | |
| 74 | |
| 75 tips = tree.get_terminals() | |
| 76 res = {} | |
| 77 for tip1 in tips: | |
| 78 # find the shortest distance in sequences that "cluster" with this one | |
| 79 min_dist = 99999. | |
| 80 tip2 = [] | |
| 81 for tipname, dist in walk_trunk(tip1, cutoff, parents): | |
| 82 if minimize and dist < min_dist: | |
| 83 min_dist = dist | |
| 84 tip2 = [[tipname, dist]] | |
| 85 else: | |
| 86 tip2.append([tipname, dist]) | |
| 87 | |
| 88 t1 = tip1.name | |
| 89 for t2, dist in tip2: | |
| 90 # sort tip names in lexico order | |
| 91 key = (t1, t2) if t1 < t2 else (t2, t1) | |
| 92 if key in res: | |
| 93 continue | |
| 94 res.update({key: dist}) | |
| 95 if minimize and keep_ties: | |
| 96 # output only one edge | |
| 97 break | |
| 98 | |
| 99 if returnlist: | |
| 100 reslist = [] | |
| 101 for key, dist in res.iteritems(): | |
| 102 reslist.append((key[0], key[1], dist)) | |
| 103 return reslist | |
| 104 | |
| 105 return res | |
| 106 | |
| 107 | |
| 108 def main(): | |
| 109 parser = argparse.ArgumentParser( | |
| 110 description='Generate clusters of tips from a tree that have' | |
| 111 ' a path length within a maximum distance of each other.' | |
| 112 ) | |
| 113 parser.add_argument('tree', help='<input> file ' | |
| 114 'containing Newick tree string.') | |
| 115 parser.add_argument('cutoff', type=float, help='Maximum ' | |
| 116 'patristic distance.') | |
| 117 parser.add_argument('outfile', default=None, help='<output> file to ' | |
| 118 'write results ' | |
| 119 'in CSV format.') | |
| 120 parser.add_argument('--minimize', help='Report no more than ' | |
| 121 'one nearest neighbour per tip.', | |
| 122 action='store_true') | |
| 123 parser.add_argument('--keep_ties', help='If more than one ' | |
| 124 'tip has the same patristic distance, ' | |
| 125 'report all as nearest neighbours.', | |
| 126 action='store_true') | |
| 127 parser.add_argument('--overwrite', help='Overwrite existing ' | |
| 128 'output file.', | |
| 129 action='store_true') | |
| 130 args = parser.parse_args() | |
| 131 | |
| 132 assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff) | |
| 133 | |
| 134 if os.path.exists(args.outfile) and not args.overwrite: | |
| 135 print ('Output file', args.outfile, 'already exists, use --overwrite.') | |
| 136 sys.exit() | |
| 137 | |
| 138 outfile = open(args.outfile, 'w') | |
| 139 outfile.write('tree,tip1,tip2,dist,is.tie\n') | |
| 140 | |
| 141 trees = Phylo.parse(args.tree, 'newick') | |
| 142 for treenum, tree in enumerate(trees): | |
| 143 results = find_short_edges(tree, args.cutoff) | |
| 144 for key, dist in results.items(): | |
| 145 outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist)) | |
| 146 | |
| 147 outfile.close() | |
| 148 | |
| 149 | |
| 150 if __name__ == "__main__": | |
| 151 main() |
