Mercurial > repos > bgruening > chemfp
comparison nxn_clustering.py @ 5:57a1a58056a6 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit d786052cd04f8b25eb4aff80b1b9724f62031b61
author | bgruening |
---|---|
date | Sat, 20 May 2017 12:57:06 -0400 |
parents | 70b071de9bee |
children | 0d88631bb7de |
comparison
equal
deleted
inserted
replaced
4:685a138131f0 | 5:57a1a58056a6 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Modified version of code examples from the chemfp project. | |
4 http://code.google.com/p/chem-fingerprints/ | |
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | |
6 """ | |
7 import matplotlib | |
8 matplotlib.use('Agg') | |
9 import argparse | |
10 import os | |
11 import chemfp | |
12 import scipy.cluster.hierarchy as hcluster | |
13 import pylab | |
14 import numpy | |
15 | |
def distance_matrix(arena, tanimoto_threshold=0.0, num_threads=None):
    """Return the NxN Tanimoto distance matrix for the fingerprints in *arena*.

    A dense similarity matrix is built first: 1.0 on the diagonal, the
    Tanimoto score for every pair reported by the symmetric threshold
    search, and 0.0 for every pair the search did not report.  The
    returned array is ``1.0 - similarities``, so unreported pairs end up
    with a distance of 1.0 and the diagonal with 0.0.

    Parameters:
        arena: fingerprint arena; must support ``len()`` and be accepted by
            ``chemfp.search.threshold_tanimoto_search_symmetric``.
        tanimoto_threshold: threshold passed to the symmetric search.
        num_threads: number of threads handed to ``chemfp.set_num_threads``.
            When omitted, the module-level ``args.processors`` parsed by the
            command-line entry point is used, as in earlier versions.

    Returns:
        A ``numpy`` array of doubles with shape ``(len(arena), len(arena))``.

    Raises:
        Exception: if the dense NxN matrix cannot be allocated.
    """
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal.
    # Only allocation failures are translated; anything else (including
    # KeyboardInterrupt) propagates unchanged.
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError, OverflowError):
        raise Exception('Input dataset is too large!')

    if num_threads is None:
        # Backward-compatible fallback: the script entry point stores the
        # parsed command-line options in the module-level ``args``.
        num_threads = args.processors
    chemfp.set_num_threads(num_threads)

    # Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix.
    return 1.0 - similarities
38 | |
39 | |
if __name__ == "__main__":
    # Command-line entry point: load an fps file, build the NxN distance
    # matrix and optionally dump it and/or render a dendrogram image.
    arg_parser = argparse.ArgumentParser(
        description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
""")

    arg_parser.add_argument(
        "-i", "--input",
        dest="input_path",
        required=True,
        help="Path to the input file.")
    arg_parser.add_argument(
        "-c", "--cluster",
        dest="cluster_image",
        help="Path to the output cluster image.")
    arg_parser.add_argument(
        "-s", "--smatrix",
        dest="similarity_matrix",
        help="Path to the similarity matrix output file.")
    arg_parser.add_argument(
        "-t", "--threshold",
        dest="tanimoto_threshold",
        type=float,
        default=0.0,
        help="Tanimoto threshold [0.0]")
    arg_parser.add_argument(
        "--oformat",
        default='png',
        help="Output format (png, svg)")
    arg_parser.add_argument(
        '-p', '--processors',
        type=int,
        default=4)

    # ``args`` must stay a module-level name: distance_matrix() reads
    # ``args.processors`` to configure the chemfp thread count.
    args = arg_parser.parse_args()

    fps_reader = chemfp.open(args.input_path, format='fps')
    fingerprints = chemfp.load_fingerprints(fps_reader)
    dist = distance_matrix(fingerprints, args.tanimoto_threshold)

    if args.similarity_matrix:
        # Note: the values written here are distances (1 - similarity),
        # dumped with numpy's ndarray.tofile().
        dist.tofile(args.similarity_matrix)

    if args.cluster_image:
        # NOTE(review): linkage() is handed the square matrix, which SciPy
        # treats as a set of observation vectors rather than as pairwise
        # distances -- confirm this is the intended clustering input.
        tree = hcluster.linkage(dist, method="single", metric="euclidean")
        hcluster.dendrogram(tree, labels=fingerprints.ids)
        pylab.savefig(args.cluster_image, format=args.oformat)
80 |