Mercurial > repos > bgruening > sucos_clustering
comparison sucos_max.py @ 0:f80cfac80c53 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author | bgruening |
---|---|
date | Wed, 02 Oct 2019 12:58:19 -0400 |
parents | |
children | 58d18838e244 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f80cfac80c53 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Assess ligands against a second set of molecules using SuCOS scores. | |
4 This is a quite specialised function that is designed to take a set of potential follow up | |
5 compounds and compare them to a set of clustered fragment hits to help identify which follow up | |
6 ligands best map to the binding space of the hits. | |
7 | |
8 The clustering of the fragment hits is expected to be performed with the sucos_cluster.py module | |
9 and will generate a set of SD files, one for each cluster of hits (presumably corresponding to a | |
10 binding pocket in the protein target). | |
11 | |
12 Each molecule in the input ligands is then compared (using SuCOS) to each hit in the clusters. There | |
13 are different modes which determine how the ligand is assessed. | |
14 | |
15 In mode 'max' the hit with the best SuCOS score is identified. The output is a SD file with each of the ligands, | |
16 with these additional fields for each molecule: | |
17 Max_SuCOS_Score - the best score | |
18 Max_SuCOS_FeatureMap_Score - the feature map score for the hit that has the best SuCOS score | |
19 Max_SuCOS_Protrude_Score - the protrude volume for the hit that has the best SuCOS score | |
20 Max_SuCOS_Cluster - the name of the cluster SD file that contains the best hit | |
21 Max_SuCOS_Index - the index of the best hit in the SD file | |
22 | |
23 In mode 'cum' the sum of all the scores is calculated and reported as the following properties for each molecule: | |
24 Cum_SuCOS_Score property: the sum of the SuCOS scores | |
25 Cum_SuCOS_FeatureMap_Score: the sum of the feature map scores | |
26 Cum_SuCOS_Protrude_Score: the sum of the protrude volume scores | |
27 | |
28 If a molecule has no alignment to any of the clustered hits (all alignment scores of zero) then it is not | |
29 included in the results. | |
30 | |
31 | |
32 SuCOS is the work of Susan Leung. | |
33 GitHub: https://github.com/susanhleung/SuCOS | |
34 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | |
35 """ | |
36 | |
37 import sucos, utils | |
38 import argparse, gzip, os | |
39 from rdkit import Chem | |
40 | |
41 | |
42 def process(inputfilename, clusterfilenames, outputfilename, mode): | |
43 | |
44 all_clusters = {} | |
45 for filename in clusterfilenames: | |
46 cluster = [] | |
47 cluster_file = utils.open_file_for_reading(filename) | |
48 suppl = Chem.ForwardSDMolSupplier(cluster_file) | |
49 i = 0 | |
50 for mol in suppl: | |
51 i += 1 | |
52 if not mol: | |
53 utils.log("WARNING: failed to generate molecule", i, "in cluster", filename) | |
54 continue | |
55 try: | |
56 features = sucos.getRawFeatures(mol) | |
57 cluster.append((mol, features)) | |
58 except: | |
59 utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename) | |
60 | |
61 cluster_file.close() | |
62 all_clusters[filename] = cluster | |
63 | |
64 input_file = utils.open_file_for_reading(inputfilename) | |
65 suppl = Chem.ForwardSDMolSupplier(input_file) | |
66 output_file = utils.open_file_for_writing(outputfilename) | |
67 writer = Chem.SDWriter(output_file) | |
68 | |
69 comparisons = 0 | |
70 mol_num = 0 | |
71 | |
72 for mol in suppl: | |
73 mol_num += 1 | |
74 if not mol: | |
75 utils.log("WARNING: failed to generate molecule", mol_num, "in input") | |
76 continue | |
77 try: | |
78 query_features = sucos.getRawFeatures(mol) | |
79 except: | |
80 utils.log("WARNING: failed to generate features for molecule", mol_num, "in input") | |
81 continue | |
82 scores = [0, 0, 0] | |
83 for clusterfilename in all_clusters: | |
84 cluster = all_clusters[clusterfilename] | |
85 index = 0 | |
86 for entry in cluster: | |
87 hit = entry[0] | |
88 ref_features = entry[1] | |
89 index += 1 | |
90 comparisons += 1 | |
91 sucos_score, fm_score, vol_score = sucos.get_SucosScore(hit, mol, | |
92 tani=False, ref_features=ref_features, query_features=query_features) | |
93 if mode == 'max': | |
94 if sucos_score > scores[0]: | |
95 scores[0] = sucos_score | |
96 scores[1] = fm_score | |
97 scores[2] = vol_score | |
98 cluster_name = clusterfilename | |
99 cluster_index = index | |
100 elif mode == 'cum': | |
101 scores[0] += sucos_score | |
102 scores[1] += fm_score | |
103 scores[2] += vol_score | |
104 else: | |
105 raise ValueError("Invalid mode: " + mode) | |
106 | |
107 if scores[0] > 0: | |
108 if mode == 'max': | |
109 cluster_file_name_only = cluster_name.split(os.sep)[-1] | |
110 #utils.log("Max SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2],"File:", cluster_file_name_only, "Index:", cluster_index) | |
111 mol.SetDoubleProp("Max_SuCOS_Score", scores[0]) | |
112 mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1]) | |
113 mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2]) | |
114 mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only) | |
115 mol.SetIntProp("Max_SuCOS_Index", cluster_index) | |
116 | |
117 else: | |
118 #utils.log("Cum SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2]) | |
119 mol.SetDoubleProp("Cum_SuCOS_Score", scores[0]) | |
120 mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1]) | |
121 mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2]) | |
122 | |
123 writer.write(mol) | |
124 | |
125 else: | |
126 utils.log("Molecule", mol_num, "did not overlay. Omitting from results") | |
127 | |
128 | |
129 input_file.close() | |
130 writer.flush() | |
131 writer.close() | |
132 output_file.close() | |
133 | |
134 utils.log("Completed", comparisons, "comparisons") | |
135 | |
136 | |
137 ### start main execution ######################################### | |
138 | |
139 def main(): | |
140 parser = argparse.ArgumentParser(description='Max SuCOS scores with RDKit') | |
141 parser.add_argument('-i', '--input', help='Input file to score in SDF format. Can be gzipped (*.gz).') | |
142 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | |
143 parser.add_argument('-m', '--mode', choices=['max', 'cum'], | |
144 default='max', help='Score mode: max = best score, cum = sum of all scores') | |
145 parser.add_argument('clusters', nargs='*', help="One or more SDF files with the clustered hits") | |
146 | |
147 args = parser.parse_args() | |
148 utils.log("Max SuCOS Args: ", args) | |
149 | |
150 process(args.input, args.clusters, args.output, args.mode) | |
151 | |
152 | |
153 if __name__ == "__main__": | |
154 main() |