Mercurial > repos > nml > patrist
changeset 0:f8847f5a5491 draft default tip
"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
author | nml |
---|---|
date | Tue, 17 Dec 2019 09:53:52 -0500 |
parents | |
children | |
files | patrist.py patrist.xml test-data/outfile.csv test-data/tree.nhx |
diffstat | 4 files changed, 237 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/patrist.py Tue Dec 17 09:53:52 2019 -0500 @@ -0,0 +1,151 @@ +import argparse +import os +import sys + +from Bio import Phylo + + +def walk_up(tips, curnode, pathlen, cutoff): + """ + Recursive function for traversing up a tree. + """ + pathlen += curnode.branch_length + if pathlen < cutoff: + if curnode.is_terminal(): + tips.append((curnode.name, pathlen)) + else: + for c in curnode.clades: + tips = walk_up(tips, c, pathlen, cutoff) + return tips + + +def walk_trunk(curnode, cutoff, parents): + """ + Find all tips in the tree that are within a threshold distance + of a reference tip. + """ + # first go down to parent and up other branch + tips = [] + pathlen = curnode.branch_length # 0.0184788 + p = parents[curnode] + + for c in p.clades: + if c == curnode: + continue + if c.is_terminal(): + if pathlen + c.branch_length < cutoff: + tips.append((c.name, pathlen+c.branch_length)) + else: + tips.extend(walk_up([], c, pathlen, cutoff)) + + # next walk down trunk until path length exceeds cutoff or hit root + while p in parents: + curnode = p + pathlen += p.branch_length # + 0.0104047 + p = parents[curnode] + if pathlen >= cutoff: + break + for c in p.clades: + if c == curnode: + continue + if c.is_terminal(): + if pathlen + c.branch_length < cutoff: # + 0.0503079 + tips.append((c.name, pathlen+c.branch_length)) + else: + tips.extend(walk_up([], c, pathlen, cutoff)) + return tips + + +def find_short_edges(tree, cutoff, keep_ties=True, + minimize=False, returnlist=False): + """ + Find the shortest edge from the earliest sequence of a patient to a + any sequence from any other patient. + minimize = keep only edge from earliest seq to the closest other seq + keep_ties = [to be used in conjunction with minimize] + report all edges with the same minimum distance + """ + + # generate dictionary of child->parent associations + parents = {} + for clade in tree.find_clades(order='level'): + for child in clade: + parents.update({child: clade}) + + tips = tree.get_terminals() + res = {} + for tip1 in tips: + # find the shortest distance in sequences that "cluster" with this one + min_dist = 99999. + tip2 = [] + for tipname, dist in walk_trunk(tip1, cutoff, parents): + if minimize and dist < min_dist: + min_dist = dist + tip2 = [[tipname, dist]] + else: + tip2.append([tipname, dist]) + + t1 = tip1.name + for t2, dist in tip2: + # sort tip names in lexico order + key = (t1, t2) if t1 < t2 else (t2, t1) + if key in res: + continue + res.update({key: dist}) + if minimize and keep_ties: + # output only one edge + break + + if returnlist: + reslist = [] + for key, dist in res.iteritems(): + reslist.append((key[0], key[1], dist)) + return reslist + + return res + + +def main(): + parser = argparse.ArgumentParser( + description='Generate clusters of tips from a tree that have' + ' a path length within a maximum distance of each other.' + ) + parser.add_argument('tree', help='<input> file ' + 'containing Newick tree string.') + parser.add_argument('cutoff', type=float, help='Maximum ' + 'patristic distance.') + parser.add_argument('outfile', default=None, help='<output> file to ' + 'write results ' + 'in CSV format.') + parser.add_argument('--minimize', help='Report no more than ' + 'one nearest neighbour per tip.', + action='store_true') + parser.add_argument('--keep_ties', help='If more than one ' + 'tip has the same patristic distance, ' + 'report all as nearest neighbours.', + action='store_true') + parser.add_argument('--overwrite', help='Overwrite existing ' + 'output file.', + action='store_true') + args = parser.parse_args() + + assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff) + + if os.path.exists(args.outfile) and not args.overwrite: + print ('Output file', args.outfile, 'already exists, use --overwrite.') + sys.exit() + + outfile = open(args.outfile, 'w') + outfile.write('tree,tip1,tip2,dist,is.tie\n') + + trees = Phylo.parse(args.tree, 'newick') + for treenum, tree in enumerate(trees): + results = find_short_edges(tree, args.cutoff) + for key, dist in results.items(): + outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist)) + + outfile.close() + + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/patrist.xml Tue Dec 17 09:53:52 2019 -0500 @@ -0,0 +1,81 @@ +<?xml version="1.0"?> +<tool id="patrist" name="PATRIST" version="@VERSION@"> + <description>extract patristic distance from a tree</description> + <macros> + <token name="@VERSION@">0.1.2</token> + </macros> + <requirements> + <requirement type="package" version="3.8.0">python</requirement> + <requirement type="package" version="1.74">biopython</requirement> + </requirements> + <version_command>python '$__tool_directory__/patrist.py'</version_command> + <command detect_errors="aggressive"> +<![CDATA[ +python '$__tool_directory__/patrist.py' + +#if '$minimize': + --minimize +#end if +#if '$keep_ties': + --keep_ties +#end if +--overwrite + +'$tree' '$cutoff' '$outfile' +]]> + </command> + <inputs> + <param format="nhx" name="tree" type="data" multiple="false" label="Newick tree file" /> + <param name="cutoff" type="float" value="0.5" label="Maximum patristic distance" /> + <param name="minimize" type="boolean" truevalue="" falsevalue="" checked="false" label="Report no more than one nearest neighbour per tip" /> + <param name="keep_ties" type="boolean" truevalue="" falsevalue="" checked="false" label="If more than one tip has the same patristic distance report all as nearest neighbours" /> + </inputs> + <outputs> + <data name="outfile" format="csv" label="${tool.name} on ${on_string}:outfile.csv" /> + </outputs> + <tests> + <test> + <param name="tree" value="tree.nhx" /> + <param name="cutoff" value="0.5" /> + <output name="outfile" file="outfile.csv" ftype="csv" lines_diff="0" /> + </test> + </tests> + <help> +<![CDATA[ + +=========== +Description +=========== + +.. class:: infomark + +Patrist will rapidly extract patristic distances from a tree for clustering tips +below a user-defined threshold. + +.. _Patrist: https://gist.github.com/ArtPoon/7330231e74201ded54b87142a1d6cd02 + +----- +Input +----- + +'tree', <input> file containing Newick tree string. + +---------- +Parameters +---------- + +'cutoff', Maximum patristic distance. +'--minimize', Report no more than one nearest neighbour per tip. +'--keep_ties', If more than one tip has the same patristic distance report all as nearest neighbours. + +------ +Output +------ + +'outfile', <output> file to write results in CSV format.' The script overwrites any file with the current file name in the output directory. + +]]> + </help> + <citations> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/outfile.csv Tue Dec 17 09:53:52 2019 -0500 @@ -0,0 +1,1 @@ +tree,tip1,tip2,dist,is.tie
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/tree.nhx Tue Dec 17 09:53:52 2019 -0500 @@ -0,0 +1,4 @@ +(((erHomoC:0.28006,erCaelC:0.22089):0.40998,(erHomoA:0.32304, +(erpCaelC:0.58815,((erHomoB:0.5807,erCaelB:0.23569):0.03586, +erCaelA:0.38272):0.06516):0.03492):0.14265):0.63594,(TRXHomo:0.65866, +TRXSacch:0.38791):0.32147,TRXEcoli:0.57336);