diff nxn_clustering.py @ 5:57a1a58056a6 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit d786052cd04f8b25eb4aff80b1b9724f62031b61
author bgruening
date Sat, 20 May 2017 12:57:06 -0400
parents 70b071de9bee
children 0d88631bb7de
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nxn_clustering.py	Sat May 20 12:57:06 2017 -0400
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+"""
+    Modified version of code examples from the chemfp project.
+    http://code.google.com/p/chem-fingerprints/
+    Thanks to Andrew Dalke of Andrew Dalke Scientific!
+"""
+import matplotlib
+matplotlib.use('Agg')
+import argparse
+import os
+import chemfp
+import scipy.cluster.hierarchy as hcluster
+import pylab
+import numpy
+
+def distance_matrix(arena, tanimoto_threshold = 0.0):
+    n = len(arena)
+    # Start off a similarity matrix with 1.0s along the diagonal
+    try:
+        similarities = numpy.identity(n, "d")
+    except:
+        raise Exception('Input dataset is to large!')
+    chemfp.set_num_threads( args.processors )
+
+    ## Compute the full similarity matrix.
+    # The implementation computes the upper-triangle then copies
+    # the upper-triangle into lower-triangle. It does not include
+    # terms for the diagonal.
+    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)
+
+    # Copy the results into the NumPy array.
+    for row_index, row in enumerate(results.iter_indices_and_scores()):
+        for target_index, target_score in row:
+            similarities[row_index, target_index] = target_score
+
+    # Return the distance matrix using the similarity matrix
+    return 1.0 - similarities
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
+For more details please see the chemfp documentation:
+https://chemfp.readthedocs.org
+""")
+
+    parser.add_argument("-i", "--input", dest="input_path",
+                    required=True,
+                    help="Path to the input file.")
+
+    parser.add_argument("-c", "--cluster", dest="cluster_image",
+                    help="Path to the output cluster image.")
+
+    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
+                    help="Path to the similarity matrix output file.")
+
+    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", 
+                    type=float, default=0.0,
+                    help="Tanimoto threshold [0.0]")
+
+    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")
+
+    parser.add_argument('-p', '--processors', type=int, 
+        default=4)
+
+    args = parser.parse_args()
+
+    targets = chemfp.open( args.input_path, format='fps' )
+    arena = chemfp.load_fingerprints( targets )
+    distances  = distance_matrix( arena, args.tanimoto_threshold )
+
+    if args.similarity_matrix:
+        distances.tofile( args.similarity_matrix )
+
+    if args.cluster_image:
+        linkage = hcluster.linkage( distances, method="single", metric="euclidean" )
+
+        hcluster.dendrogram(linkage, labels=arena.ids)
+
+        pylab.savefig( args.cluster_image, format=args.oformat )
+