comparison sucos_max.py @ 0:f80cfac80c53 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author bgruening
date Wed, 02 Oct 2019 12:58:19 -0400
parents
children 58d18838e244
comparison
equal deleted inserted replaced
-1:000000000000 0:f80cfac80c53
1 #!/usr/bin/env python
2 """
3 Assess ligands against a second set of molecules using SuCOS scores.
4 This is a quite specialised function that is designed to take a set of potential follow up
5 compounds and compare them to a set of clustered fragment hits to help identify which follow up
6 ligands best map to the binding space of the hits.
7
8 The clustering of the fragment hits is expected to be performed with the sucos_cluster.py module
9 and will generate a set of SD files, one for each cluster of hits (presumably corresponding to a
10 binding pocket in the protein target).
11
12 Each molecule in the input ligands is then compared (using SuCOS) to each hit in the clusters. There
13 are different modes which determine how the ligand is assessed.
14
15 In mode 'max' the hit with the best SuCOS score is identified. The output is a SD file with each of the ligands,
16 with these additional fields for each molecule:
17 Max_SuCOS_Score - the best score
18 Max_SuCOS_FeatureMap_Score - the feature map score for the hit that has the best SuCOS score
19 Max_SuCOS_Protrude_Score - the protrude volume for the hit that has the best SuCOS score
20 Max_SuCOS_Cluster - the name of the cluster SD file that contains the best hit
21 Max_SuCOS_Index - the index of the best hit in the SD file
22
23 In mode 'cum' the sum of all the scores is calculated and reported as the following properties for each molecule:
24 Cum_SuCOS_Score property: the sum of the SuCOS scores
25 Cum_SuCOS_FeatureMap_Score: the sum of the feature map scores
26 Cum_SuCOS_Protrude_Score: the sum of the protrude volume scores
27
28 If a molecule has no alignment to any of the clustered hits (all alignment scores of zero) then it is not
29 included in the results.
30
31
32 SuCOS is the work of Susan Leung.
33 GitHub: https://github.com/susanhleung/SuCOS
34 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
35 """
36
37 import sucos, utils
38 import argparse, gzip, os
39 from rdkit import Chem
40
41
42 def process(inputfilename, clusterfilenames, outputfilename, mode):
43
44 all_clusters = {}
45 for filename in clusterfilenames:
46 cluster = []
47 cluster_file = utils.open_file_for_reading(filename)
48 suppl = Chem.ForwardSDMolSupplier(cluster_file)
49 i = 0
50 for mol in suppl:
51 i += 1
52 if not mol:
53 utils.log("WARNING: failed to generate molecule", i, "in cluster", filename)
54 continue
55 try:
56 features = sucos.getRawFeatures(mol)
57 cluster.append((mol, features))
58 except:
59 utils.log("WARNING: failed to generate features for molecule", i, "in cluster", filename)
60
61 cluster_file.close()
62 all_clusters[filename] = cluster
63
64 input_file = utils.open_file_for_reading(inputfilename)
65 suppl = Chem.ForwardSDMolSupplier(input_file)
66 output_file = utils.open_file_for_writing(outputfilename)
67 writer = Chem.SDWriter(output_file)
68
69 comparisons = 0
70 mol_num = 0
71
72 for mol in suppl:
73 mol_num += 1
74 if not mol:
75 utils.log("WARNING: failed to generate molecule", mol_num, "in input")
76 continue
77 try:
78 query_features = sucos.getRawFeatures(mol)
79 except:
80 utils.log("WARNING: failed to generate features for molecule", mol_num, "in input")
81 continue
82 scores = [0, 0, 0]
83 for clusterfilename in all_clusters:
84 cluster = all_clusters[clusterfilename]
85 index = 0
86 for entry in cluster:
87 hit = entry[0]
88 ref_features = entry[1]
89 index += 1
90 comparisons += 1
91 sucos_score, fm_score, vol_score = sucos.get_SucosScore(hit, mol,
92 tani=False, ref_features=ref_features, query_features=query_features)
93 if mode == 'max':
94 if sucos_score > scores[0]:
95 scores[0] = sucos_score
96 scores[1] = fm_score
97 scores[2] = vol_score
98 cluster_name = clusterfilename
99 cluster_index = index
100 elif mode == 'cum':
101 scores[0] += sucos_score
102 scores[1] += fm_score
103 scores[2] += vol_score
104 else:
105 raise ValueError("Invalid mode: " + mode)
106
107 if scores[0] > 0:
108 if mode == 'max':
109 cluster_file_name_only = cluster_name.split(os.sep)[-1]
110 #utils.log("Max SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2],"File:", cluster_file_name_only, "Index:", cluster_index)
111 mol.SetDoubleProp("Max_SuCOS_Score", scores[0])
112 mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1])
113 mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2])
114 mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only)
115 mol.SetIntProp("Max_SuCOS_Index", cluster_index)
116
117 else:
118 #utils.log("Cum SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2])
119 mol.SetDoubleProp("Cum_SuCOS_Score", scores[0])
120 mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1])
121 mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2])
122
123 writer.write(mol)
124
125 else:
126 utils.log("Molecule", mol_num, "did not overlay. Omitting from results")
127
128
129 input_file.close()
130 writer.flush()
131 writer.close()
132 output_file.close()
133
134 utils.log("Completed", comparisons, "comparisons")
135
136
137 ### start main execution #########################################
138
139 def main():
140 parser = argparse.ArgumentParser(description='Max SuCOS scores with RDKit')
141 parser.add_argument('-i', '--input', help='Input file to score in SDF format. Can be gzipped (*.gz).')
142 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).')
143 parser.add_argument('-m', '--mode', choices=['max', 'cum'],
144 default='max', help='Score mode: max = best score, cum = sum of all scores')
145 parser.add_argument('clusters', nargs='*', help="One or more SDF files with the clustered hits")
146
147 args = parser.parse_args()
148 utils.log("Max SuCOS Args: ", args)
149
150 process(args.input, args.clusters, args.output, args.mode)
151
152
153 if __name__ == "__main__":
154 main()