comparison nxn_clustering.py @ 12:3b14765c22ee draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author bgruening
date Sat, 25 Sep 2021 19:07:44 +0000
parents 198b1e30c739
children
comparison
equal deleted inserted replaced
11:92c7cdc243e8 12:3b14765c22ee
1 #!/usr/bin/env python 1 # !/usr/bin/env python
2 """ 2 """
3 Modified version of code examples from the chemfp project. 3 Modified version of code examples from the chemfp project.
4 http://code.google.com/p/chem-fingerprints/ 4 http://code.google.com/p/chem-fingerprints/
5 Thanks to Andrew Dalke of Andrew Dalke Scientific! 5 Thanks to Andrew Dalke of Andrew Dalke Scientific!
6 """ 6 """
7
8 import argparse
9
10 import chemfp
7 import matplotlib 11 import matplotlib
8 matplotlib.use('Agg') 12 matplotlib.use("Agg") # noqa
9 from matplotlib import rcParams 13 from matplotlib import rcParams # noqa
10 rcParams.update({'figure.autolayout': True}) 14 rcParams.update({"figure.autolayout": True}) # noqa
11 import argparse 15 import numpy # noqa
12 import os 16 import pylab # noqa
13 import chemfp 17 import scipy.cluster.hierarchy as hcluster # noqa
14 import scipy.cluster.hierarchy as hcluster
15 import pylab
16 import numpy
17 18
18 def distance_matrix(arena, tanimoto_threshold = 0.0): 19
20 def distance_matrix(arena, tanimoto_threshold=0.0):
19 n = len(arena) 21 n = len(arena)
20 # Start off a similarity matrix with 1.0s along the diagonal 22 # Start off a similarity matrix with 1.0s along the diagonal
21 try: 23 try:
22 similarities = numpy.identity(n, "d") 24 similarities = numpy.identity(n, "d")
23 except: 25 except Exception:
24 raise Exception('Input dataset is to large!') 26 raise Exception("Input dataset is to large!")
25 chemfp.set_num_threads( args.processors ) 27 chemfp.set_num_threads(args.processors)
26 28
27 ## Compute the full similarity matrix. 29 # Compute the full similarity matrix.
28 # The implementation computes the upper-triangle then copies 30 # The implementation computes the upper-triangle then copies
29 # the upper-triangle into lower-triangle. It does not include 31 # the upper-triangle into lower-triangle. It does not include
30 # terms for the diagonal. 32 # terms for the diagonal.
31 results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) 33 results = chemfp.search.threshold_tanimoto_search_symmetric(
34 arena, threshold=tanimoto_threshold
35 )
32 36
33 # Copy the results into the NumPy array. 37 # Copy the results into the NumPy array.
34 for row_index, row in enumerate(results.iter_indices_and_scores()): 38 for row_index, row in enumerate(results.iter_indices_and_scores()):
35 for target_index, target_score in row: 39 for target_index, target_score in row:
36 similarities[row_index, target_index] = target_score 40 similarities[row_index, target_index] = target_score
38 # Return the distance matrix using the similarity matrix 42 # Return the distance matrix using the similarity matrix
39 return 1.0 - similarities 43 return 1.0 - similarities
40 44
41 45
42 if __name__ == "__main__": 46 if __name__ == "__main__":
43 parser = argparse.ArgumentParser(description="""NxN clustering for fps files. 47 parser = argparse.ArgumentParser(
48 description="""NxN clustering for fps files.
44 For more details please see the chemfp documentation: 49 For more details please see the chemfp documentation:
45 https://chemfp.readthedocs.org 50 https://chemfp.readthedocs.org
46 """) 51 """
52 )
47 53
48 parser.add_argument("-i", "--input", dest="input_path", 54 parser.add_argument(
49 required=True, 55 "-i",
50 help="Path to the input file.") 56 "--input",
57 dest="input_path",
58 required=True,
59 help="Path to the input file.",
60 )
51 61
52 parser.add_argument("-c", "--cluster", dest="cluster_image", 62 parser.add_argument(
53 help="Path to the output cluster image.") 63 "-c",
64 "--cluster",
65 dest="cluster_image",
66 help="Path to the output cluster image.",
67 )
54 68
55 parser.add_argument("-s", "--smatrix", dest="similarity_matrix", 69 parser.add_argument(
56 help="Path to the similarity matrix output file.") 70 "-s",
71 "--smatrix",
72 dest="similarity_matrix",
73 help="Path to the similarity matrix output file.",
74 )
57 75
58 parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", 76 parser.add_argument(
59 type=float, default=0.0, 77 "-t",
60 help="Tanimoto threshold [0.0]") 78 "--threshold",
79 dest="tanimoto_threshold",
80 type=float,
81 default=0.0,
82 help="Tanimoto threshold [0.0]",
83 )
61 84
62 parser.add_argument("--oformat", default='png', help="Output format (png, svg)") 85 parser.add_argument("--oformat", default="png", help="Output format (png, svg)")
63 86
64 parser.add_argument('-p', '--processors', type=int, 87 parser.add_argument("-p", "--processors", type=int, default=4)
65 default=4)
66 88
67 args = parser.parse_args() 89 args = parser.parse_args()
68 90
69 targets = chemfp.open( args.input_path, format='fps' ) 91 targets = chemfp.open(args.input_path, format="fps")
70 arena = chemfp.load_fingerprints( targets ) 92 arena = chemfp.load_fingerprints(targets)
71 distances = distance_matrix( arena, args.tanimoto_threshold ) 93 distances = distance_matrix(arena, args.tanimoto_threshold)
72 94
73 if args.similarity_matrix: 95 if args.similarity_matrix:
74 numpy.savetxt(args.similarity_matrix, distances) 96 numpy.savetxt(args.similarity_matrix, distances)
75 97
76 if args.cluster_image: 98 if args.cluster_image:
77 linkage = hcluster.linkage(distances, method="single", metric="euclidean") 99 linkage = hcluster.linkage(distances, method="single", metric="euclidean")
78 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) 100 hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0)
79 pylab.savefig(args.cluster_image, format=args.oformat) 101 pylab.savefig(args.cluster_image, format=args.oformat)
80