changeset 0:f8847f5a5491 draft default tip

"planemo upload for repository https://github.com/phac-nml/patrist commit f64cb2a8399f83d8c025c8efdc3c3eec72922a7d"
author nml
date Tue, 17 Dec 2019 09:53:52 -0500
parents
children
files patrist.py patrist.xml test-data/outfile.csv test-data/tree.nhx
diffstat 4 files changed, 237 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patrist.py	Tue Dec 17 09:53:52 2019 -0500
@@ -0,0 +1,151 @@
+import argparse
+import os
+import sys
+
+from Bio import Phylo
+
+
+def walk_up(tips, curnode, pathlen, cutoff):
+    """
+    Recursive function for traversing up a tree.
+    """
+    pathlen += curnode.branch_length
+    if pathlen < cutoff:
+        if curnode.is_terminal():
+            tips.append((curnode.name, pathlen))
+        else:
+            for c in curnode.clades:
+                tips = walk_up(tips, c, pathlen, cutoff)
+    return tips
+
+
+def walk_trunk(curnode, cutoff, parents):
+    """
+    Find all tips in the tree that are within a threshold distance
+    of a reference tip.
+    """
+    # first go down to parent and up other branch
+    tips = []
+    pathlen = curnode.branch_length  # 0.0184788
+    p = parents[curnode]
+
+    for c in p.clades:
+        if c == curnode:
+            continue
+        if c.is_terminal():
+            if pathlen + c.branch_length < cutoff:
+                tips.append((c.name, pathlen+c.branch_length))
+        else:
+            tips.extend(walk_up([], c, pathlen, cutoff))
+
+    # next walk down trunk until path length exceeds cutoff or hit root
+    while p in parents:
+        curnode = p
+        pathlen += p.branch_length  # + 0.0104047
+        p = parents[curnode]
+        if pathlen >= cutoff:
+            break
+        for c in p.clades:
+            if c == curnode:
+                continue
+            if c.is_terminal():
+                if pathlen + c.branch_length < cutoff:  # + 0.0503079
+                    tips.append((c.name, pathlen+c.branch_length))
+            else:
+                tips.extend(walk_up([], c, pathlen, cutoff))
+    return tips
+
+
+def find_short_edges(tree, cutoff, keep_ties=True,
+                     minimize=False, returnlist=False):
+    """
+    Find the shortest edge from the earliest sequence of a patient to a
+    any sequence from any other patient.
+    minimize = keep only edge from earliest seq to the closest other seq
+    keep_ties = [to be used in conjunction with minimize]
+                report all edges with the same minimum distance
+    """
+
+    # generate dictionary of child->parent associations
+    parents = {}
+    for clade in tree.find_clades(order='level'):
+        for child in clade:
+            parents.update({child: clade})
+
+    tips = tree.get_terminals()
+    res = {}
+    for tip1 in tips:
+        # find the shortest distance in sequences that "cluster" with this one
+        min_dist = 99999.
+        tip2 = []
+        for tipname, dist in walk_trunk(tip1, cutoff, parents):
+            if minimize and dist < min_dist:
+                min_dist = dist
+                tip2 = [[tipname, dist]]
+            else:
+                tip2.append([tipname, dist])
+
+        t1 = tip1.name
+        for t2, dist in tip2:
+            # sort tip names in lexico order
+            key = (t1, t2) if t1 < t2 else (t2, t1)
+            if key in res:
+                continue
+            res.update({key: dist})
+            if minimize and keep_ties:
+                # output only one edge
+                break
+
+    if returnlist:
+        reslist = []
+        for key, dist in res.iteritems():
+            reslist.append((key[0], key[1], dist))
+        return reslist
+
+    return res
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Generate clusters of tips from a tree that have'
+        ' a path length within a maximum distance of each other.'
+    )
+    parser.add_argument('tree', help='<input> file '
+                                     'containing Newick tree string.')
+    parser.add_argument('cutoff', type=float, help='Maximum '
+                                                   'patristic distance.')
+    parser.add_argument('outfile', default=None, help='<output> file to '
+                                                      'write results '
+                                                      'in CSV format.')
+    parser.add_argument('--minimize', help='Report no more than '
+                                           'one nearest neighbour per tip.',
+                                           action='store_true')
+    parser.add_argument('--keep_ties', help='If more than one '
+                        'tip has the same patristic distance, '
+                        'report all as nearest neighbours.',
+                        action='store_true')
+    parser.add_argument('--overwrite', help='Overwrite existing '
+                                            'output file.',
+                                            action='store_true')
+    args = parser.parse_args()
+
+    assert args.cutoff > 0, 'Cutoff %f must be greater than 0.' % (args.cutoff)
+
+    if os.path.exists(args.outfile) and not args.overwrite:
+        print ('Output file', args.outfile, 'already exists, use --overwrite.')
+        sys.exit()
+
+    outfile = open(args.outfile, 'w')
+    outfile.write('tree,tip1,tip2,dist,is.tie\n')
+
+    trees = Phylo.parse(args.tree, 'newick')
+    for treenum, tree in enumerate(trees):
+        results = find_short_edges(tree, args.cutoff)
+        for key, dist in results.items():
+            outfile.write('%d,%s,%s,%f\n' % (treenum, key[0], key[1], dist))
+
+    outfile.close()
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/patrist.xml	Tue Dec 17 09:53:52 2019 -0500
@@ -0,0 +1,81 @@
+<?xml version="1.0"?>
+<tool id="patrist" name="PATRIST" version="@VERSION@">
+    <description>extract patristic distance from a tree</description>
+    <macros>
+        <token name="@VERSION@">0.1.2</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="3.8.0">python</requirement>
+        <requirement type="package" version="1.74">biopython</requirement>
+    </requirements>
+    <version_command>python '$__tool_directory__/patrist.py'</version_command>
+    <command detect_errors="aggressive">
+<![CDATA[
+python '$__tool_directory__/patrist.py' 
+
+#if '$minimize':
+    --minimize 
+#end if
+#if '$keep_ties':
+    --keep_ties 
+#end if
+--overwrite 
+
+'$tree' '$cutoff' '$outfile'
+]]>
+    </command>
+    <inputs>
+        <param format="nhx" name="tree" type="data" multiple="false" label="Newick tree file" />
+        <param name="cutoff" type="float" value="0.5" label="Maximum patristic distance" />
+        <param name="minimize" type="boolean" truevalue="" falsevalue="" checked="false" label="Report no more than one nearest neighbour per tip" />
+        <param name="keep_ties" type="boolean" truevalue="" falsevalue="" checked="false" label="If more than one tip has the same patristic distance report all as nearest neighbours" />
+    </inputs>
+    <outputs>
+        <data name="outfile" format="csv" label="${tool.name} on ${on_string}:outfile.csv" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="tree" value="tree.nhx" />
+            <param name="cutoff" value="0.5" />
+            <output name="outfile" file="outfile.csv" ftype="csv" lines_diff="0" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
+===========
+Description
+===========
+
+.. class:: infomark
+
+Patrist will rapidly extract patristic distances from a tree for clustering tips
+below a user-defined threshold.
+
+.. _Patrist: https://gist.github.com/ArtPoon/7330231e74201ded54b87142a1d6cd02
+
+-----
+Input
+-----
+
+'tree', <input> file containing Newick tree string.
+
+----------
+Parameters
+----------
+
+'cutoff', Maximum patristic distance.
+'--minimize', Report no more than one nearest neighbour per tip.
+'--keep_ties', If more than one tip has the same patristic distance report all as nearest neighbours.
+
+------
+Output
+------
+
+'outfile', <output> file to write results in CSV format.' The script overwrites any file with the current file name in the output directory.
+
+]]>
+    </help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/outfile.csv	Tue Dec 17 09:53:52 2019 -0500
@@ -0,0 +1,1 @@
+tree,tip1,tip2,dist,is.tie
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tree.nhx	Tue Dec 17 09:53:52 2019 -0500
@@ -0,0 +1,4 @@
+(((erHomoC:0.28006,erCaelC:0.22089):0.40998,(erHomoA:0.32304,
+(erpCaelC:0.58815,((erHomoB:0.5807,erCaelB:0.23569):0.03586,
+erCaelA:0.38272):0.06516):0.03492):0.14265):0.63594,(TRXHomo:0.65866,
+TRXSacch:0.38791):0.32147,TRXEcoli:0.57336);