Mercurial > repos > bgruening > sucos_clustering
comparison sucos_cluster.py @ 6:b8725fec8c7b draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author | bgruening |
---|---|
date | Wed, 14 Apr 2021 09:30:48 +0000 |
parents | f80cfac80c53 |
children |
comparison
equal
deleted
inserted
replaced
5:12725d4b90f3 | 6:b8725fec8c7b |
---|---|
8 | 8 |
9 SuCOS is the work of Susan Leung. | 9 SuCOS is the work of Susan Leung. |
10 GitHub: https://github.com/susanhleung/SuCOS | 10 GitHub: https://github.com/susanhleung/SuCOS |
11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | 11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 |
12 """ | 12 """ |
13 import argparse | |
13 | 14 |
14 import sucos, utils | |
15 import argparse, gzip | |
16 from rdkit import Chem | |
17 import numpy as np | 15 import numpy as np |
18 import pandas as pd | 16 import pandas as pd |
19 from scipy.cluster.hierarchy import linkage, fcluster | 17 import sucos |
18 import utils | |
19 from rdkit import Chem | |
20 from scipy.cluster.hierarchy import fcluster, linkage | |
20 | 21 |
21 ### start main execution ######################################### | 22 # start main execution ######################################### |
22 | 23 |
23 | 24 |
24 def calc_distance_matrix(mols): | 25 def calc_distance_matrix(mols): |
25 """ | 26 """ |
26 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum | 27 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum |
42 tmp = [] | 43 tmp = [] |
43 for tuple2 in mol_fm_tuples: | 44 for tuple2 in mol_fm_tuples: |
44 if tuple1[0] == tuple2[0]: | 45 if tuple1[0] == tuple2[0]: |
45 tmp.append(0.0) | 46 tmp.append(0.0) |
46 else: | 47 else: |
47 #utils.log("Calculating SuCOS between", mol1, mol2) | 48 # utils.log("Calculating SuCOS between", mol1, mol2) |
48 sucos_score, fm_score, tani_score = sucos.get_SucosScore(tuple1[0], tuple2[0], | 49 sucos_score, fm_score, tani_score = sucos.get_SucosScore( |
49 tani=True, ref_features=tuple1[1], query_features=tuple2[1]) | 50 tuple1[0], |
51 tuple2[0], | |
52 tani=True, | |
53 ref_features=tuple1[1], | |
54 query_features=tuple2[1], | |
55 ) | |
50 tmp.append(1.0 - sucos_score) | 56 tmp.append(1.0 - sucos_score) |
51 matrix.append(tmp) | 57 matrix.append(tmp) |
52 | |
53 | 58 |
54 return matrix | 59 return matrix |
55 | 60 |
56 | 61 |
57 def cluster(matrix, threshold=0.8): | 62 def cluster(matrix, threshold=0.8): |
62 :return: An array of clusters, each cluster being an array of the indices from the matrix. | 67 :return: An array of clusters, each cluster being an array of the indices from the matrix. |
63 """ | 68 """ |
64 | 69 |
65 indexes = [x for x in range(0, len(matrix))] | 70 indexes = [x for x in range(0, len(matrix))] |
66 cols = [x for x in range(0, len(matrix[0]))] | 71 cols = [x for x in range(0, len(matrix[0]))] |
67 #utils.log("indexes", indexes) | 72 # utils.log("indexes", indexes) |
68 #utils.log("cols", cols) | 73 # utils.log("cols", cols) |
69 df = pd.DataFrame(matrix, columns=cols, index=indexes) | 74 df = pd.DataFrame(matrix, columns=cols, index=indexes) |
70 utils.log("DataFrame:", df.shape) | 75 utils.log("DataFrame:", df.shape) |
71 #utils.log(df) | 76 # utils.log(df) |
72 indices = np.triu_indices(df.shape[0], k=1) | 77 indices = np.triu_indices(df.shape[0], k=1) |
73 #utils.log("Indices:", indices) | 78 # utils.log("Indices:", indices) |
74 t = np.array(df)[indices] | 79 t = np.array(df)[indices] |
75 Z = linkage(t, 'average') | 80 Z = linkage(t, "average") |
76 lig_clusters = [] | 81 lig_clusters = [] |
77 cluster_arr = fcluster(Z, t=threshold, criterion='distance') | 82 cluster_arr = fcluster(Z, t=threshold, criterion="distance") |
78 for i in range(np.amax(cluster_arr)): | 83 for i in range(np.amax(cluster_arr)): |
79 clus = df.columns[np.argwhere(cluster_arr==i+1)] | 84 clus = df.columns[np.argwhere(cluster_arr == i + 1)] |
80 lig_clusters.append([x[0] for x in clus.tolist()]) | 85 lig_clusters.append([x[0] for x in clus.tolist()]) |
81 | 86 |
82 utils.log("Clusters", lig_clusters) | 87 utils.log("Clusters", lig_clusters) |
83 return lig_clusters | 88 return lig_clusters |
89 | |
84 | 90 |
85 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False): | 91 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False): |
86 """ | 92 """ |
87 Write the molecules to SDF files, 1 file for each cluster. | 93 Write the molecules to SDF files, 1 file for each cluster. |
88 :param mols The molecules to write: | 94 :param mols The molecules to write: |
97 for cluster in clusters: | 103 for cluster in clusters: |
98 i += 1 | 104 i += 1 |
99 filename = basename + str(i) + ".sdf" | 105 filename = basename + str(i) + ".sdf" |
100 if gzip: | 106 if gzip: |
101 filename += ".gz" | 107 filename += ".gz" |
102 utils.log("Writing ", len(cluster), "molecules in cluster", i, "to file", filename) | 108 utils.log( |
109 "Writing ", len(cluster), "molecules in cluster", i, "to file", filename | |
110 ) | |
103 output_file = utils.open_file_for_writing(filename) | 111 output_file = utils.open_file_for_writing(filename) |
104 writer = Chem.SDWriter(output_file) | 112 writer = Chem.SDWriter(output_file) |
105 for index in cluster: | 113 for index in cluster: |
106 mol = mols[index] | 114 mol = mols[index] |
107 writer.write(mol) | 115 writer.write(mol) |
108 writer.flush() | 116 writer.flush() |
109 writer.close() | 117 writer.close() |
110 output_file.close() | 118 output_file.close() |
111 | 119 |
112 | 120 |
113 | |
114 def main(): | 121 def main(): |
115 parser = argparse.ArgumentParser(description='Clustering with SuCOS and RDKit') | 122 parser = argparse.ArgumentParser(description="Clustering with SuCOS and RDKit") |
116 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | 123 parser.add_argument( |
117 parser.add_argument('-o', '--output', default="cluster", help="Base name for output files in SDF format. " + | 124 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)." |
118 "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created") | 125 ) |
119 parser.add_argument('--gzip', action='store_true', help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz') | 126 parser.add_argument( |
120 parser.add_argument('-t', '--threshold', type=float, default=0.8, help='Clustering threshold') | 127 "-o", |
128 "--output", | |
129 default="cluster", | |
130 help="Base name for output files in SDF format. " | |
131 + "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created", | |
132 ) | |
133 parser.add_argument( | |
134 "--gzip", | |
135 action="store_true", | |
136 help="Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz", | |
137 ) | |
138 parser.add_argument( | |
139 "-t", "--threshold", type=float, default=0.8, help="Clustering threshold" | |
140 ) | |
121 | 141 |
122 args = parser.parse_args() | 142 args = parser.parse_args() |
123 utils.log("SuCOS Cluster Args: ", args) | 143 utils.log("SuCOS Cluster Args: ", args) |
124 | 144 |
125 input_file = utils.open_file_for_reading(args.input) | 145 input_file = utils.open_file_for_reading(args.input) |