comparison sucos.py @ 6:4f1896782f7c draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author bgruening
date Wed, 14 Apr 2021 09:31:11 +0000
parents 8161c08627bf
children
comparison
equal deleted inserted replaced
5:fe318c648502 6:4f1896782f7c
7 GitHub: https://github.com/susanhleung/SuCOS 7 GitHub: https://github.com/susanhleung/SuCOS
8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
9 """ 9 """
10 10
11 from __future__ import print_function 11 from __future__ import print_function
12 import argparse, os, sys, gzip 12
13 import argparse
14 import os
15
13 import numpy as np 16 import numpy as np
14 from rdkit import Chem, rdBase, RDConfig 17 import utils
18 from rdkit import Chem, RDConfig
15 from rdkit.Chem import AllChem, rdShapeHelpers 19 from rdkit.Chem import AllChem, rdShapeHelpers
16 from rdkit.Chem.FeatMaps import FeatMaps 20 from rdkit.Chem.FeatMaps import FeatMaps
17 import utils 21
18 22 # start function definitions #########################################
19
20 ### start function definitions #########################################
21 23
22 # Setting up the features to use in FeatureMap 24 # Setting up the features to use in FeatureMap
23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) 25 fdef = AllChem.BuildFeatureFactory(
26 os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
27 )
24 28
25 fmParams = {} 29 fmParams = {}
26 for k in fdef.GetFeatureFamilies(): 30 for k in fdef.GetFeatureFamilies():
27 fparams = FeatMaps.FeatMapParams() 31 fparams = FeatMaps.FeatMapParams()
28 fmParams[k] = fparams 32 fmParams[k] = fparams
29 33
30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', 34 keep = (
31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') 35 "Donor",
36 "Acceptor",
37 "NegIonizable",
38 "PosIonizable",
39 "ZnBinder",
40 "Aromatic",
41 "Hydrophobe",
42 "LumpedHydrophobe",
43 )
44
32 45
33 def filterFeature(f): 46 def filterFeature(f):
34 result = f.GetFamily() in keep 47 result = f.GetFamily() in keep
35 # TODO - nothing ever seems to be filtered. Is this expected? 48 # TODO - nothing ever seems to be filtered. Is this expected?
36 if not result: 49 if not result:
37 utils.log("Filtered out feature type", f.GetFamily()) 50 utils.log("Filtered out feature type", f.GetFamily())
38 return result 51 return result
39 52
53
40 def getRawFeatures(mol): 54 def getRawFeatures(mol):
41 55
42 rawFeats = fdef.GetFeaturesForMol(mol) 56 rawFeats = fdef.GetFeaturesForMol(mol)
43 # filter that list down to only include the ones we're interested in 57 # filter that list down to only include the ones we're interested in
44 filtered = list(filter(filterFeature, rawFeats)) 58 filtered = list(filter(filterFeature, rawFeats))
45 return filtered 59 return filtered
46 60
47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): 61
62 def get_FeatureMapScore(
63 small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All
64 ):
48 """ 65 """
49 Generate the feature map score. 66 Generate the feature map score.
50 67
51 :param small_feats: 68 :param small_feats:
52 :param large_feats: 69 :param large_feats:
56 73
57 featLists = [] 74 featLists = []
58 for rawFeats in [small_feats, large_feats]: 75 for rawFeats in [small_feats, large_feats]:
59 # filter that list down to only include the ones we're interested in 76 # filter that list down to only include the ones we're interested in
60 featLists.append(rawFeats) 77 featLists.append(rawFeats)
61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] 78 fms = [
79 FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams)
80 for x in featLists
81 ]
62 # set the score mode 82 # set the score mode
63 fms[0].scoreMode = score_mode 83 fms[0].scoreMode = score_mode
64 84
65 try: 85 try:
66 if tani: 86 if tani:
67 c = fms[0].ScoreFeats(featLists[1]) 87 c = fms[0].ScoreFeats(featLists[1])
68 A = fms[0].GetNumFeatures() 88 A = fms[0].GetNumFeatures()
69 B = len(featLists[1]) 89 B = len(featLists[1])
70 if B != fms[1].GetNumFeatures(): 90 if B != fms[1].GetNumFeatures():
71 utils.log("Why isn't B equal to number of features...?!") 91 utils.log("Why isn't B equal to number of features...?!")
72 tani_score = float(c) / (A+B-c) 92 tani_score = float(c) / (A + B - c)
73 return tani_score 93 return tani_score
74 else: 94 else:
75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) 95 fm_score = fms[0].ScoreFeats(featLists[1]) / min(
96 fms[0].GetNumFeatures(), len(featLists[1])
97 )
76 return fm_score 98 return fm_score
77 except ZeroDivisionError: 99 except ZeroDivisionError:
78 utils.log("ZeroDivisionError") 100 utils.log("ZeroDivisionError")
79 return 0 101 return 0
80 102
81 if tani: 103 if tani:
82 tani_score = float(c) / (A+B-c) 104 tani_score = float(c) / (A + B - c)
83 return tani_score 105 return tani_score
84 else: 106 else:
85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) 107 fm_score = fms[0].ScoreFeats(featLists[1]) / min(
108 fms[0].GetNumFeatures(), len(featLists[1])
109 )
86 return fm_score 110 return fm_score
87 111
88 112
89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): 113 def get_SucosScore(
114 ref_mol,
115 query_mol,
116 tani=False,
117 ref_features=None,
118 query_features=None,
119 score_mode=FeatMaps.FeatMapScoreMode.All,
120 ):
90 """ 121 """
91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. 122 This is the key function that calculates the SuCOS scores and is expected to be called from other modules.
92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having 123 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having
93 to recalculate them. Use the getRawFeatures function to pre-calculate the features. 124 to recalculate them. Use the getRawFeatures function to pre-calculate the features.
94 125
105 ref_features = getRawFeatures(ref_mol) 136 ref_features = getRawFeatures(ref_mol)
106 if not query_features: 137 if not query_features:
107 query_features = getRawFeatures(query_mol) 138 query_features = getRawFeatures(query_mol)
108 139
109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) 140 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode)
110 fm_score = np.clip(fm_score, 0, 1) 141 fm_score = np.float(np.clip(fm_score, 0, 1))
111 142
112 try : 143 try:
113 if tani: 144 if tani:
114 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) 145 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol))
115 tani_sim = np.clip(tani_sim, 0, 1) 146 tani_sim = np.clip(tani_sim, 0, 1)
116 SuCOS_score = 0.5*fm_score + 0.5*tani_sim 147 SuCOS_score = 0.5 * fm_score + 0.5 * tani_sim
117 return SuCOS_score, fm_score, tani_sim 148 return SuCOS_score, fm_score, tani_sim
118 else: 149 else:
119 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) 150 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(
151 ref_mol, query_mol, allowReordering=False
152 )
120 protrude_dist = np.clip(protrude_dist, 0, 1) 153 protrude_dist = np.clip(protrude_dist, 0, 1)
121 protrude_val = 1.0 - protrude_dist 154 protrude_val = 1.0 - protrude_dist
122 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val 155 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val
123 return SuCOS_score, fm_score, protrude_val 156 return SuCOS_score, fm_score, protrude_val
124 except: 157 except Exception:
125 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0") 158 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0")
126 return 0, 0, 0 159 return 0, 0, 0
127 160
128 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, 161
129 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): 162 def process(
130 163 refmol_filename,
131 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) 164 inputs_filename,
132 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") 165 outputs_filename,
166 refmol_index=None,
167 refmol_format=None,
168 tani=False,
169 score_mode=FeatMaps.FeatMapScoreMode.All,
170 ):
171
172 ref_mol = utils.read_single_molecule(
173 refmol_filename, index=refmol_index, format=refmol_format
174 )
175 # utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms")
133 ref_features = getRawFeatures(ref_mol) 176 ref_features = getRawFeatures(ref_mol)
134 177
135 input_file = utils.open_file_for_reading(inputs_filename) 178 input_file = utils.open_file_for_reading(inputs_filename)
136 suppl = Chem.ForwardSDMolSupplier(input_file) 179 suppl = Chem.ForwardSDMolSupplier(input_file)
137 output_file = utils.open_file_for_writing(outputs_filename) 180 output_file = utils.open_file_for_writing(outputs_filename)
139 182
140 count = 0 183 count = 0
141 total = 0 184 total = 0
142 errors = 0 185 errors = 0
143 for mol in suppl: 186 for mol in suppl:
144 count +=1 187 count += 1
145 if mol is None: 188 if mol is None:
146 continue 189 continue
147 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") 190 # utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms")
148 try: 191 try:
149 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) 192 sucos_score, fm_score, val3 = get_SucosScore(
193 ref_mol,
194 mol,
195 tani=tani,
196 ref_features=ref_features,
197 score_mode=score_mode,
198 )
150 mol.SetDoubleProp("SuCOS_Score", sucos_score) 199 mol.SetDoubleProp("SuCOS_Score", sucos_score)
151 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) 200 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
152 if tani: 201 if tani:
153 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) 202 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
154 else: 203 else:
155 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) 204 mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
156 utils.log("Scores:", sucos_score, fm_score, val3) 205 utils.log("Scores:", sucos_score, fm_score, val3)
157 writer.write(mol) 206 writer.write(mol)
158 total +=1 207 total += 1
159 except ValueError as e: 208 except ValueError as e:
160 errors +=1 209 errors += 1
161 utils.log("Molecule", count, "failed to score:", e.message) 210 utils.log("Molecule", count, "failed to score:", e.message)
162 211
163 input_file.close() 212 input_file.close()
164 writer.flush() 213 writer.flush()
165 writer.close() 214 writer.close()
166 output_file.close() 215 output_file.close()
167 216
168 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") 217 utils.log(
218 "Completed.", total, "processed, ", count, "succeeded, ", errors, "errors"
219 )
220
169 221
170 def parse_score_mode(value): 222 def parse_score_mode(value):
171 if value == None or value == 'all': 223 if value is None or value == "all":
172 return FeatMaps.FeatMapScoreMode.All 224 return FeatMaps.FeatMapScoreMode.All
173 elif value == 'closest': 225 elif value == "closest":
174 return FeatMaps.FeatMapScoreMode.Closest 226 return FeatMaps.FeatMapScoreMode.Closest
175 elif value == 'best': 227 elif value == "best":
176 return FeatMaps.FeatMapScoreMode.Best 228 return FeatMaps.FeatMapScoreMode.Best
177 else: 229 else:
178 raise ValueError(value + " is not a valid scoring mode option") 230 raise ValueError(value + " is not a valid scoring mode option")
179 231
180 232
181 ### start main execution ######################################### 233 # start main execution #########################################
234
182 235
183 def main(): 236 def main():
184 237
185 parser = argparse.ArgumentParser(description='SuCOS with RDKit') 238 parser = argparse.ArgumentParser(description="SuCOS with RDKit")
186 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') 239 parser.add_argument(
187 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') 240 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)."
188 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + 241 )
189 "Only needed if files don't have the expected extensions") 242 parser.add_argument(
190 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) 243 "-r",
191 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') 244 "--refmol",
192 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') 245 help="Molecule to compare against in Molfile (.mol) or SDF (.sdf) format",
193 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], 246 )
194 help="choose the scoring mode for the feature map, default is 'all'.") 247 parser.add_argument(
248 "--refmol-format",
249 help="Format for the reference molecule (mol or sdf). "
250 + "Only needed if files don't have the expected extensions",
251 )
252 parser.add_argument(
253 "--refmolidx",
254 help="Reference molecule index in SD file if not the first",
255 type=int,
256 default=1,
257 )
258 parser.add_argument(
259 "-o", "--output", help="Output file in SDF format. Can be gzipped (*.gz)."
260 )
261 parser.add_argument(
262 "--tanimoto", action="store_true", help="Include Tanimoto distance in score"
263 )
264 parser.add_argument(
265 "--score_mode",
266 choices=["all", "closest", "best"],
267 help="choose the scoring mode for the feature map, default is 'all'.",
268 )
195 269
196 args = parser.parse_args() 270 args = parser.parse_args()
197 utils.log("SuCOS Args: ", args) 271 utils.log("SuCOS Args: ", args)
198 272
199 score_mode = parse_score_mode(args.score_mode) 273 score_mode = parse_score_mode(args.score_mode)
200 274
201 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, 275 process(
202 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) 276 args.refmol,
277 args.input,
278 args.output,
279 refmol_index=args.refmolidx,
280 refmol_format=args.refmol_format,
281 tani=args.tanimoto,
282 score_mode=score_mode,
283 )
203 284
204 285
205 if __name__ == "__main__": 286 if __name__ == "__main__":
206 main() 287 main()