Mercurial > repos > bgruening > chemfp
comparison nxn_clustering.py @ 12:3b14765c22ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author | bgruening |
---|---|
date | Sat, 25 Sep 2021 19:07:44 +0000 |
parents | 198b1e30c739 |
children |
comparison
equal
deleted
inserted
replaced
11:92c7cdc243e8 | 12:3b14765c22ee |
---|---|
1 #!/usr/bin/env python | 1 # !/usr/bin/env python |
2 """ | 2 """ |
3 Modified version of code examples from the chemfp project. | 3 Modified version of code examples from the chemfp project. |
4 http://code.google.com/p/chem-fingerprints/ | 4 http://code.google.com/p/chem-fingerprints/ |
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! | 5 Thanks to Andrew Dalke of Andrew Dalke Scientific! |
6 """ | 6 """ |
7 | |
8 import argparse | |
9 | |
10 import chemfp | |
7 import matplotlib | 11 import matplotlib |
8 matplotlib.use('Agg') | 12 matplotlib.use("Agg") # noqa |
9 from matplotlib import rcParams | 13 from matplotlib import rcParams # noqa |
10 rcParams.update({'figure.autolayout': True}) | 14 rcParams.update({"figure.autolayout": True}) # noqa |
11 import argparse | 15 import numpy # noqa |
12 import os | 16 import pylab # noqa |
13 import chemfp | 17 import scipy.cluster.hierarchy as hcluster # noqa |
14 import scipy.cluster.hierarchy as hcluster | |
15 import pylab | |
16 import numpy | |
17 | 18 |
18 def distance_matrix(arena, tanimoto_threshold = 0.0): | 19 |
20 def distance_matrix(arena, tanimoto_threshold=0.0): | |
19 n = len(arena) | 21 n = len(arena) |
20 # Start off a similarity matrix with 1.0s along the diagonal | 22 # Start off a similarity matrix with 1.0s along the diagonal |
21 try: | 23 try: |
22 similarities = numpy.identity(n, "d") | 24 similarities = numpy.identity(n, "d") |
23 except: | 25 except Exception: |
24 raise Exception('Input dataset is to large!') | 26 raise Exception("Input dataset is to large!") |
25 chemfp.set_num_threads( args.processors ) | 27 chemfp.set_num_threads(args.processors) |
26 | 28 |
27 ## Compute the full similarity matrix. | 29 # Compute the full similarity matrix. |
28 # The implementation computes the upper-triangle then copies | 30 # The implementation computes the upper-triangle then copies |
29 # the upper-triangle into lower-triangle. It does not include | 31 # the upper-triangle into lower-triangle. It does not include |
30 # terms for the diagonal. | 32 # terms for the diagonal. |
31 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) | 33 results = chemfp.search.threshold_tanimoto_search_symmetric( |
34 arena, threshold=tanimoto_threshold | |
35 ) | |
32 | 36 |
33 # Copy the results into the NumPy array. | 37 # Copy the results into the NumPy array. |
34 for row_index, row in enumerate(results.iter_indices_and_scores()): | 38 for row_index, row in enumerate(results.iter_indices_and_scores()): |
35 for target_index, target_score in row: | 39 for target_index, target_score in row: |
36 similarities[row_index, target_index] = target_score | 40 similarities[row_index, target_index] = target_score |
38 # Return the distance matrix using the similarity matrix | 42 # Return the distance matrix using the similarity matrix |
39 return 1.0 - similarities | 43 return 1.0 - similarities |
40 | 44 |
41 | 45 |
42 if __name__ == "__main__": | 46 if __name__ == "__main__": |
43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files. | 47 parser = argparse.ArgumentParser( |
48 description="""NxN clustering for fps files. | |
44 For more details please see the chemfp documentation: | 49 For more details please see the chemfp documentation: |
45 https://chemfp.readthedocs.org | 50 https://chemfp.readthedocs.org |
46 """) | 51 """ |
52 ) | |
47 | 53 |
48 parser.add_argument("-i", "--input", dest="input_path", | 54 parser.add_argument( |
49 required=True, | 55 "-i", |
50 help="Path to the input file.") | 56 "--input", |
57 dest="input_path", | |
58 required=True, | |
59 help="Path to the input file.", | |
60 ) | |
51 | 61 |
52 parser.add_argument("-c", "--cluster", dest="cluster_image", | 62 parser.add_argument( |
53 help="Path to the output cluster image.") | 63 "-c", |
64 "--cluster", | |
65 dest="cluster_image", | |
66 help="Path to the output cluster image.", | |
67 ) | |
54 | 68 |
55 parser.add_argument("-s", "--smatrix", dest="similarity_matrix", | 69 parser.add_argument( |
56 help="Path to the similarity matrix output file.") | 70 "-s", |
71 "--smatrix", | |
72 dest="similarity_matrix", | |
73 help="Path to the similarity matrix output file.", | |
74 ) | |
57 | 75 |
58 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", | 76 parser.add_argument( |
59 type=float, default=0.0, | 77 "-t", |
60 help="Tanimoto threshold [0.0]") | 78 "--threshold", |
79 dest="tanimoto_threshold", | |
80 type=float, | |
81 default=0.0, | |
82 help="Tanimoto threshold [0.0]", | |
83 ) | |
61 | 84 |
62 parser.add_argument("--oformat", default='png', help="Output format (png, svg)") | 85 parser.add_argument("--oformat", default="png", help="Output format (png, svg)") |
63 | 86 |
64 parser.add_argument('-p', '--processors', type=int, | 87 parser.add_argument("-p", "--processors", type=int, default=4) |
65 default=4) | |
66 | 88 |
67 args = parser.parse_args() | 89 args = parser.parse_args() |
68 | 90 |
69 targets = chemfp.open( args.input_path, format='fps' ) | 91 targets = chemfp.open(args.input_path, format="fps") |
70 arena = chemfp.load_fingerprints( targets ) | 92 arena = chemfp.load_fingerprints(targets) |
71 distances = distance_matrix( arena, args.tanimoto_threshold ) | 93 distances = distance_matrix(arena, args.tanimoto_threshold) |
72 | 94 |
73 if args.similarity_matrix: | 95 if args.similarity_matrix: |
74 numpy.savetxt(args.similarity_matrix, distances) | 96 numpy.savetxt(args.similarity_matrix, distances) |
75 | 97 |
76 if args.cluster_image: | 98 if args.cluster_image: |
77 linkage = hcluster.linkage(distances, method="single", metric="euclidean") | 99 linkage = hcluster.linkage(distances, method="single", metric="euclidean") |
78 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) | 100 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0) |
79 pylab.savefig(args.cluster_image, format=args.oformat) | 101 pylab.savefig(args.cluster_image, format=args.oformat) |
80 |