comparison sucos_cluster.py @ 7:9b48456a96fe draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author bgruening
date Wed, 14 Apr 2021 09:30:28 +0000
parents bb5365381c8f
children
comparison
equal deleted inserted replaced
6:55ac04db36aa 7:9b48456a96fe
8 8
9 SuCOS is the work of Susan Leung. 9 SuCOS is the work of Susan Leung.
10 GitHub: https://github.com/susanhleung/SuCOS 10 GitHub: https://github.com/susanhleung/SuCOS
11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
12 """ 12 """
13 import argparse
13 14
14 import sucos, utils
15 import argparse, gzip
16 from rdkit import Chem
17 import numpy as np 15 import numpy as np
18 import pandas as pd 16 import pandas as pd
19 from scipy.cluster.hierarchy import linkage, fcluster 17 import sucos
18 import utils
19 from rdkit import Chem
20 from scipy.cluster.hierarchy import fcluster, linkage
20 21
21 ### start main execution ######################################### 22 # start main execution #########################################
22 23
23 24
24 def calc_distance_matrix(mols): 25 def calc_distance_matrix(mols):
25 """ 26 """
26 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum 27 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum
42 tmp = [] 43 tmp = []
43 for tuple2 in mol_fm_tuples: 44 for tuple2 in mol_fm_tuples:
44 if tuple1[0] == tuple2[0]: 45 if tuple1[0] == tuple2[0]:
45 tmp.append(0.0) 46 tmp.append(0.0)
46 else: 47 else:
47 #utils.log("Calculating SuCOS between", mol1, mol2) 48 # utils.log("Calculating SuCOS between", mol1, mol2)
48 sucos_score, fm_score, tani_score = sucos.get_SucosScore(tuple1[0], tuple2[0], 49 sucos_score, fm_score, tani_score = sucos.get_SucosScore(
49 tani=True, ref_features=tuple1[1], query_features=tuple2[1]) 50 tuple1[0],
51 tuple2[0],
52 tani=True,
53 ref_features=tuple1[1],
54 query_features=tuple2[1],
55 )
50 tmp.append(1.0 - sucos_score) 56 tmp.append(1.0 - sucos_score)
51 matrix.append(tmp) 57 matrix.append(tmp)
52
53 58
54 return matrix 59 return matrix
55 60
56 61
57 def cluster(matrix, threshold=0.8): 62 def cluster(matrix, threshold=0.8):
62 :return: An array of clusters, each cluster being an array of the indices from the matrix. 67 :return: An array of clusters, each cluster being an array of the indices from the matrix.
63 """ 68 """
64 69
65 indexes = [x for x in range(0, len(matrix))] 70 indexes = [x for x in range(0, len(matrix))]
66 cols = [x for x in range(0, len(matrix[0]))] 71 cols = [x for x in range(0, len(matrix[0]))]
67 #utils.log("indexes", indexes) 72 # utils.log("indexes", indexes)
68 #utils.log("cols", cols) 73 # utils.log("cols", cols)
69 df = pd.DataFrame(matrix, columns=cols, index=indexes) 74 df = pd.DataFrame(matrix, columns=cols, index=indexes)
70 utils.log("DataFrame:", df.shape) 75 utils.log("DataFrame:", df.shape)
71 #utils.log(df) 76 # utils.log(df)
72 indices = np.triu_indices(df.shape[0], k=1) 77 indices = np.triu_indices(df.shape[0], k=1)
73 #utils.log("Indices:", indices) 78 # utils.log("Indices:", indices)
74 t = np.array(df)[indices] 79 t = np.array(df)[indices]
75 Z = linkage(t, 'average') 80 Z = linkage(t, "average")
76 lig_clusters = [] 81 lig_clusters = []
77 cluster_arr = fcluster(Z, t=threshold, criterion='distance') 82 cluster_arr = fcluster(Z, t=threshold, criterion="distance")
78 for i in range(np.amax(cluster_arr)): 83 for i in range(np.amax(cluster_arr)):
79 clus = df.columns[np.argwhere(cluster_arr==i+1)] 84 clus = df.columns[np.argwhere(cluster_arr == i + 1)]
80 lig_clusters.append([x[0] for x in clus.tolist()]) 85 lig_clusters.append([x[0] for x in clus.tolist()])
81 86
82 utils.log("Clusters", lig_clusters) 87 utils.log("Clusters", lig_clusters)
83 return lig_clusters 88 return lig_clusters
89
84 90
85 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False): 91 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False):
86 """ 92 """
87 Write the molecules to SDF files, 1 file for each cluster. 93 Write the molecules to SDF files, 1 file for each cluster.
88 :param mols The molecules to write: 94 :param mols The molecules to write:
97 for cluster in clusters: 103 for cluster in clusters:
98 i += 1 104 i += 1
99 filename = basename + str(i) + ".sdf" 105 filename = basename + str(i) + ".sdf"
100 if gzip: 106 if gzip:
101 filename += ".gz" 107 filename += ".gz"
102 utils.log("Writing ", len(cluster), "molecules in cluster", i, "to file", filename) 108 utils.log(
109 "Writing ", len(cluster), "molecules in cluster", i, "to file", filename
110 )
103 output_file = utils.open_file_for_writing(filename) 111 output_file = utils.open_file_for_writing(filename)
104 writer = Chem.SDWriter(output_file) 112 writer = Chem.SDWriter(output_file)
105 for index in cluster: 113 for index in cluster:
106 mol = mols[index] 114 mol = mols[index]
107 writer.write(mol) 115 writer.write(mol)
108 writer.flush() 116 writer.flush()
109 writer.close() 117 writer.close()
110 output_file.close() 118 output_file.close()
111 119
112 120
113
114 def main(): 121 def main():
115 parser = argparse.ArgumentParser(description='Clustering with SuCOS and RDKit') 122 parser = argparse.ArgumentParser(description="Clustering with SuCOS and RDKit")
116 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') 123 parser.add_argument(
117 parser.add_argument('-o', '--output', default="cluster", help="Base name for output files in SDF format. " + 124 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)."
118 "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created") 125 )
119 parser.add_argument('--gzip', action='store_true', help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz') 126 parser.add_argument(
120 parser.add_argument('-t', '--threshold', type=float, default=0.8, help='Clustering threshold') 127 "-o",
128 "--output",
129 default="cluster",
130 help="Base name for output files in SDF format. "
131 + "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created",
132 )
133 parser.add_argument(
134 "--gzip",
135 action="store_true",
136 help="Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz",
137 )
138 parser.add_argument(
139 "-t", "--threshold", type=float, default=0.8, help="Clustering threshold"
140 )
121 141
122 args = parser.parse_args() 142 args = parser.parse_args()
123 utils.log("SuCOS Cluster Args: ", args) 143 utils.log("SuCOS Cluster Args: ", args)
124 144
125 input_file = utils.open_file_for_reading(args.input) 145 input_file = utils.open_file_for_reading(args.input)