Mercurial > repos > bgruening > chemfp
diff nxn_clustering.py @ 5:57a1a58056a6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit d786052cd04f8b25eb4aff80b1b9724f62031b61
author | bgruening |
---|---|
date | Sat, 20 May 2017 12:57:06 -0400 |
parents | 70b071de9bee |
children | 0d88631bb7de |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nxn_clustering.py Sat May 20 12:57:06 2017 -0400 @@ -0,0 +1,80 @@ +#!/usr/bin/env python +""" + Modified version of code examples from the chemfp project. + http://code.google.com/p/chem-fingerprints/ + Thanks to Andrew Dalke of Andrew Dalke Scientific! +""" +import matplotlib +matplotlib.use('Agg') +import argparse +import os +import chemfp +import scipy.cluster.hierarchy as hcluster +import pylab +import numpy + +def distance_matrix(arena, tanimoto_threshold = 0.0): + n = len(arena) + # Start off a similarity matrix with 1.0s along the diagonal + try: + similarities = numpy.identity(n, "d") + except: + raise Exception('Input dataset is to large!') + chemfp.set_num_threads( args.processors ) + + ## Compute the full similarity matrix. + # The implementation computes the upper-triangle then copies + # the upper-triangle into lower-triangle. It does not include + # terms for the diagonal. + results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) + + # Copy the results into the NumPy array. + for row_index, row in enumerate(results.iter_indices_and_scores()): + for target_index, target_score in row: + similarities[row_index, target_index] = target_score + + # Return the distance matrix using the similarity matrix + return 1.0 - similarities + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""NxN clustering for fps files. +For more details please see the chemfp documentation: +https://chemfp.readthedocs.org +""") + + parser.add_argument("-i", "--input", dest="input_path", + required=True, + help="Path to the input file.") + + parser.add_argument("-c", "--cluster", dest="cluster_image", + help="Path to the output cluster image.") + + parser.add_argument("-s", "--smatrix", dest="similarity_matrix", + help="Path to the similarity matrix output file.") + + parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", + type=float, default=0.0, + help="Tanimoto threshold [0.0]") + + parser.add_argument("--oformat", default='png', help="Output format (png, svg)") + + parser.add_argument('-p', '--processors', type=int, + default=4) + + args = parser.parse_args() + + targets = chemfp.open( args.input_path, format='fps' ) + arena = chemfp.load_fingerprints( targets ) + distances = distance_matrix( arena, args.tanimoto_threshold ) + + if args.similarity_matrix: + distances.tofile( args.similarity_matrix ) + + if args.cluster_image: + linkage = hcluster.linkage( distances, method="single", metric="euclidean" ) + + hcluster.dendrogram(linkage, labels=arena.ids) + + pylab.savefig( args.cluster_image, format=args.oformat ) +