comparison sucos_cluster.py @ 0:f8f53668d5a2 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author bgruening
date Wed, 02 Oct 2019 12:58:43 -0400
parents
children 4f1896782f7c
comparison
equal deleted inserted replaced
-1:000000000000 0:f8f53668d5a2
1 #!/usr/bin/env python
2 """
3 Cluster a set of molecules based on their 3D overlays as determined by the SuCOS score.
4
5 This will generate a set of SD files, one for each cluster of molecules (presumably corresponding to a
6 binding pocket in the protein target).
7
8
9 SuCOS is the work of Susan Leung.
10 GitHub: https://github.com/susanhleung/SuCOS
11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
12 """
13
14 import sucos, utils
15 import argparse, gzip
16 from rdkit import Chem
17 import numpy as np
18 import pandas as pd
19 from scipy.cluster.hierarchy import linkage, fcluster
20
21 ### start main execution #########################################
22
23
24 def calc_distance_matrix(mols):
25 """
26 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum
27 distance possible being 1.0.
28 :param mols: A list of molecules. It must be possible to iterate through this list multiple times
29 :return: A NxN 2D array of distance scores, with N being the number of molecules in the input
30 """
31
32 # TODO - do we need to calculate both sides of the matrix? Tanimoto is supposed to be a symmetric distance measure,
33 # but the matrix that is generated does not seem to be symmetric.
34
35 mol_fm_tuples = []
36 for mol in mols:
37 features = sucos.getRawFeatures(mol)
38 mol_fm_tuples.append((mol, features))
39
40 matrix = []
41 for tuple1 in mol_fm_tuples:
42 tmp = []
43 for tuple2 in mol_fm_tuples:
44 if tuple1[0] == tuple2[0]:
45 tmp.append(0.0)
46 else:
47 #utils.log("Calculating SuCOS between", mol1, mol2)
48 sucos_score, fm_score, tani_score = sucos.get_SucosScore(tuple1[0], tuple2[0],
49 tani=True, ref_features=tuple1[1], query_features=tuple2[1])
50 tmp.append(1.0 - sucos_score)
51 matrix.append(tmp)
52
53
54 return matrix
55
56
57 def cluster(matrix, threshold=0.8):
58 """
59 Cluster the supplied distance matrix returning an array of clusters.
60 :param matrix: the distance matrix, as calculated with the calc_distance_matrix function.
61 :param threshold: The clustering cuttoff. The default of 0.8 is a reasonable value to use.
62 :return: An array of clusters, each cluster being an array of the indices from the matrix.
63 """
64
65 indexes = [x for x in range(0, len(matrix))]
66 cols = [x for x in range(0, len(matrix[0]))]
67 #utils.log("indexes", indexes)
68 #utils.log("cols", cols)
69 df = pd.DataFrame(matrix, columns=cols, index=indexes)
70 utils.log("DataFrame:", df.shape)
71 #utils.log(df)
72 indices = np.triu_indices(df.shape[0], k=1)
73 #utils.log("Indices:", indices)
74 t = np.array(df)[indices]
75 Z = linkage(t, 'average')
76 lig_clusters = []
77 cluster_arr = fcluster(Z, t=threshold, criterion='distance')
78 for i in range(np.amax(cluster_arr)):
79 clus = df.columns[np.argwhere(cluster_arr==i+1)]
80 lig_clusters.append([x[0] for x in clus.tolist()])
81
82 utils.log("Clusters", lig_clusters)
83 return lig_clusters
84
85 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False):
86 """
87 Write the molecules to SDF files, 1 file for each cluster.
88 :param mols The molecules to write:
89 :param clusters The clusters, as returned by the cluster function:
90 :param basename The basename for the file name. e.g. if basename is 'output' then files like
91 output1.sdf, output2.sdf will be written:
92 :param gzip Whether to gzip the output
93 :return:
94 """
95
96 i = 0
97 for cluster in clusters:
98 i += 1
99 filename = basename + str(i) + ".sdf"
100 if gzip:
101 filename += ".gz"
102 utils.log("Writing ", len(cluster), "molecules in cluster", i, "to file", filename)
103 output_file = utils.open_file_for_writing(filename)
104 writer = Chem.SDWriter(output_file)
105 for index in cluster:
106 mol = mols[index]
107 writer.write(mol)
108 writer.flush()
109 writer.close()
110 output_file.close()
111
112
113
114 def main():
115 parser = argparse.ArgumentParser(description='Clustering with SuCOS and RDKit')
116 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).')
117 parser.add_argument('-o', '--output', default="cluster", help="Base name for output files in SDF format. " +
118 "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created")
119 parser.add_argument('--gzip', action='store_true', help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz')
120 parser.add_argument('-t', '--threshold', type=float, default=0.8, help='Clustering threshold')
121
122 args = parser.parse_args()
123 utils.log("SuCOS Cluster Args: ", args)
124
125 input_file = utils.open_file_for_reading(args.input)
126 suppl = Chem.ForwardSDMolSupplier(input_file)
127 mols = list(suppl)
128 matrix = calc_distance_matrix(mols)
129 clusters = cluster(matrix, threshold=args.threshold)
130 write_clusters_to_sdfs(mols, clusters, args.output, gzip=args.gzip)
131
132
133 if __name__ == "__main__":
134 main()