0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Modified version of code examples from the chemfp project.
|
|
4 http://code.google.com/p/chem-fingerprints/
|
|
5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
|
|
6 """
|
|
7 import matplotlib
|
|
8 matplotlib.use('Agg')
|
|
9 import argparse
|
|
10 import os
|
|
11 import chemfp
|
|
12 import scipy.cluster.hierarchy as hcluster
|
|
13 import pylab
|
|
14 import numpy
|
|
15
|
|
def distance_matrix(arena, tanimoto_threshold=0.0, processors=None):
    """Return the NxN Tanimoto distance matrix (1.0 - similarity) for ``arena``.

    arena -- fingerprint arena; ``len(arena)`` gives the matrix dimension N.
    tanimoto_threshold -- threshold passed to the symmetric chemfp search.
        Pairs that the search does not report keep a similarity of 0.0 and
        therefore end up with a distance of 1.0.
    processors -- number of threads handed to ``chemfp.set_num_threads``.
        When None, the value is taken from a module-level ``args.processors``
        if the file is running as a script; if that does not exist either,
        chemfp's current thread setting is left untouched.

    Returns a square numpy array of doubles with 0.0 on the diagonal.
    Raises MemoryError if the NxN matrix cannot be allocated.
    """
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError):
        # Only allocation failures are translated; anything else (for
        # example KeyboardInterrupt) propagates unchanged.
        raise MemoryError('Input dataset is too large!')

    if processors is None:
        # Backward compatibility: the command-line entry point stores its
        # parsed options in a module-level ``args``.  Look it up defensively
        # so the function can also be imported and called from other code.
        processors = getattr(globals().get("args"), "processors", None)
    if processors is not None:
        chemfp.set_num_threads(processors)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities
|
|
38
|
|
39
|
|
if __name__ == "__main__":
    # Command-line entry point: load an fps file, build the NxN Tanimoto
    # distance matrix and optionally write the matrix and/or a dendrogram.
    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
""")

    parser.add_argument("-i", "--input", dest="input_path",
                        required=True,
                        help="Path to the input file.")

    parser.add_argument("-c", "--cluster", dest="cluster_image",
                        help="Path to the output cluster image.")

    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
                        help="Path to the similarity matrix output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
                        type=float, default=0.0,
                        help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")

    parser.add_argument('-p', '--processors', type=int,
                        default=4,
                        help="Number of threads chemfp may use [4]")

    # ``args`` has to stay a module-level name: distance_matrix() picks up
    # the thread count from args.processors.
    args = parser.parse_args()

    targets = chemfp.open(args.input_path, format='fps')
    arena = chemfp.load_fingerprints(targets)
    distances = distance_matrix(arena, args.tanimoto_threshold)

    if args.similarity_matrix:
        # NOTE(review): despite the option name, the array written here is
        # the distance matrix returned by distance_matrix(), dumped in
        # numpy's raw binary tofile() layout -- confirm this is what
        # downstream consumers expect.
        distances.tofile(args.similarity_matrix)

    if args.cluster_image:
        # Imported here because the file only binds scipy.cluster.hierarchy.
        from scipy.spatial.distance import squareform

        # linkage() interprets a 2-D array as observation vectors, so the
        # square matrix must be converted to condensed (1-D) form for the
        # precomputed Tanimoto distances to be used as the distances.
        # checks=False skips squareform's symmetry/zero-diagonal validation.
        condensed = squareform(distances, checks=False)
        linkage = hcluster.linkage(condensed, method="single")

        hcluster.dendrogram(linkage, labels=arena.ids)

        pylab.savefig(args.cluster_image, format=args.oformat)
|
|
80
|