Mercurial > repos > nml > patrist
diff patrist.py @ 0:f8847f5a5491 draft default tip
"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
author | nml |
---|---|
date | Tue, 17 Dec 2019 09:53:52 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/patrist.py Tue Dec 17 09:53:52 2019 -0500 @@ -0,0 +1,151 @@ +import argparse +import os +import sys + +from Bio import Phylo + + +def walk_up(tips, curnode, pathlen, cutoff): + """ + Recursive function for traversing up a tree. + """ + pathlen += curnode.branch_length + if pathlen < cutoff: + if curnode.is_terminal(): + tips.append((curnode.name, pathlen)) + else: + for c in curnode.clades: + tips = walk_up(tips, c, pathlen, cutoff) + return tips + + +def walk_trunk(curnode, cutoff, parents): + """ + Find all tips in the tree that are within a threshold distance + of a reference tip. + """ + # first go down to parent and up other branch + tips = [] + pathlen = curnode.branch_length # 0.0184788 + p = parents[curnode] + + for c in p.clades: + if c == curnode: + continue + if c.is_terminal(): + if pathlen + c.branch_length < cutoff: + tips.append((c.name, pathlen+c.branch_length)) + else: + tips.extend(walk_up([], c, pathlen, cutoff)) + + # next walk down trunk until path length exceeds cutoff or hit root + while p in parents: + curnode = p + pathlen += p.branch_length # + 0.0104047 + p = parents[curnode] + if pathlen >= cutoff: + break + for c in p.clades: + if c == curnode: + continue + if c.is_terminal(): + if pathlen + c.branch_length < cutoff: # + 0.0503079 + tips.append((c.name, pathlen+c.branch_length)) + else: + tips.extend(walk_up([], c, pathlen, cutoff)) + return tips + + +def find_short_edges(tree, cutoff, keep_ties=True, + minimize=False, returnlist=False): + """ + Find the shortest edge from the earliest sequence of a patient to a + any sequence from any other patient. + minimize = keep only edge from earliest seq to the closest other seq + keep_ties = [to be used in conjunction with minimize] + report all edges with the same minimum distance + """ + + # generate dictionary of child->parent associations + parents = {} + for clade in tree.find_clades(order='level'): + for child in clade: + parents.update({child: clade}) + + tips = tree.get_terminals() + res = {} + for tip1 in tips: + # find the shortest distance in sequences that "cluster" with this one + min_dist = 99999. + tip2 = [] + for tipname, dist in walk_trunk(tip1, cutoff, parents): + if minimize and dist < min_dist: + min_dist = dist + tip2 = [[tipname, dist]] + else: + tip2.append([tipname, dist]) + + t1 = tip1.name + for t2, dist in tip2: + # sort tip names in lexico order + key = (t1, t2) if t1 < t2 else (t2, t1) + if key in res: + continue + res.update({key: dist}) + if minimize and keep_ties: + # output only one edge + break + + if returnlist: + reslist = [] + for key, dist in res.iteritems(): + reslist.append((key[0], key[1], dist)) + return reslist + + return res + + +def main(): + parser = argparse.ArgumentParser( + description='Generate clusters of tips from a tree that have' + ' a path length within a maximum distance of each other.' + ) + parser.add_argument('tree', help='<input> file ' + 'containing Newick tree string.') + parser.add_argument('cutoff', type=float, help='Maximum ' + 'patristic distance.') + parser.add_argument('outfile', default=None, help='<output> file to ' + 'write results ' + 'in CSV format.') + parser.add_argument('--minimize', help='Report no more than ' + 'one nearest neighbour per tip.', + action='store_true') + parser.add_argument('--keep_ties', help='If more than one ' + 'tip has the same patristic distance, ' + 'report all as nearest neighbours.', + action='store_true') + parser.add_argument('--overwrite', help='Overwrite existing ' + 'output file.', + action='store_true') + args = parser.parse_args() + + assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff) + + if os.path.exists(args.outfile) and not args.overwrite: + print ('Output file', args.outfile, 'already exists, use --overwrite.') + sys.exit() + + outfile = open(args.outfile, 'w') + outfile.write('tree,tip1,tip2,dist,is.tie\n') + + trees = Phylo.parse(args.tree, 'newick') + for treenum, tree in enumerate(trees): + results = find_short_edges(tree, args.cutoff) + for key, dist in results.items(): + outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist)) + + outfile.close() + + +if __name__ == "__main__": + main()