comparison rdconf.py @ 5:67ee76f0e497 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
author bgruening
date Sat, 04 Dec 2021 16:40:23 +0000
parents
children
comparison
equal deleted inserted replaced
4:bbbf5fb356dd 5:67ee76f0e497
1 #!/usr/bin/python3
2
3 import gzip
4 import os
5 import sys
6 from optparse import OptionParser
7
8 from rdkit.Chem import AllChem as Chem
9
10 """
11 This script was originally written by David Koes, University of Pittsburgh:
12 https://github.com/dkoes/rdkit-scripts/blob/master/rdconf.py
13 It is licensed under the MIT licence.
14
15 Given a smiles file, generate 3D conformers in output sdf.
16 Energy minimizes and filters conformers to meet energy window and rms constraints.
17
18 Some time ago I compared this to alternative conformer generators and
19 it was quite competitive (especially after RDKit's UFF implementation
20 added OOP terms).
21 """
22
23
24 # convert smiles to sdf
25 def getRMS(mol, c1, c2):
26 rms = Chem.GetBestRMS(mol, mol, c1, c2)
27 return rms
28
29
30 parser = OptionParser(usage="Usage: %prog [options] <input>.smi <output>.sdf")
31 parser.add_option(
32 "--maxconfs",
33 dest="maxconfs",
34 action="store",
35 help="maximum number of conformers to generate per a molecule (default 20)",
36 default="20",
37 type="int",
38 metavar="CNT",
39 )
40 parser.add_option(
41 "--sample_multiplier",
42 dest="sample",
43 action="store",
44 help="sample N*maxconfs conformers and choose the maxconformers with lowest energy (default 1)",
45 default="1",
46 type="float",
47 metavar="N",
48 )
49 parser.add_option(
50 "--seed",
51 dest="seed",
52 action="store",
53 help="random seed (default 9162006)",
54 default="9162006",
55 type="int",
56 metavar="s",
57 )
58 parser.add_option(
59 "--rms_threshold",
60 dest="rms",
61 action="store",
62 help="filter based on rms (default 0.7)",
63 default="0.7",
64 type="float",
65 metavar="R",
66 )
67 parser.add_option(
68 "--energy_window",
69 dest="energy",
70 action="store",
71 help="filter based on energy difference with lowest energy conformer",
72 default="10",
73 type="float",
74 metavar="E",
75 )
76 parser.add_option(
77 "-v",
78 "--verbose",
79 dest="verbose",
80 action="store_true",
81 default=False,
82 help="verbose output",
83 )
84 parser.add_option(
85 "--mmff",
86 dest="mmff",
87 action="store_true",
88 default=False,
89 help="use MMFF forcefield instead of UFF",
90 )
91 parser.add_option(
92 "--nomin",
93 dest="nomin",
94 action="store_true",
95 default=False,
96 help="don't perform energy minimization (bad idea)",
97 )
98 parser.add_option(
99 "--etkdg",
100 dest="etkdg",
101 action="store_true",
102 default=False,
103 help="use new ETKDG knowledge-based method instead of distance geometry",
104 )
105
106
107 (options, args) = parser.parse_args()
108
109 if len(args) < 2:
110 parser.error("Need input and output")
111 sys.exit(-1)
112
113 input = args[0]
114 output = args[1]
115 smifile = open(input)
116 if options.verbose:
117 print("Generating a maximum of", options.maxconfs, "per a mol")
118
119 if options.etkdg and not Chem.ETKDG:
120 print("ETKDB does not appear to be implemented. Please upgrade RDKit.")
121 sys.exit(1)
122
123 split = os.path.splitext(output)
124 if split[1] == ".gz":
125 outf = gzip.open(output, "wt+")
126 output = split[0] # strip .gz
127 else:
128 outf = open(output, "w+")
129
130
131 if os.path.splitext(output)[1] == ".pdb":
132 sdwriter = Chem.PDBWriter(outf)
133 else:
134 sdwriter = Chem.SDWriter(outf)
135
136 if sdwriter is None:
137 print("Could not open ".output)
138 sys.exit(-1)
139
140 for line in smifile:
141 toks = line.split()
142 smi = toks[0]
143 name = " ".join(toks[1:])
144
145 pieces = smi.split(".")
146 if len(pieces) > 1:
147 smi = max(pieces, key=len) # take largest component by length
148 print("Taking largest component: %s\t%s" % (smi, name))
149
150 mol = Chem.MolFromSmiles(smi)
151 if mol is not None:
152 if options.verbose:
153 print(smi)
154 try:
155 Chem.SanitizeMol(mol)
156 mol = Chem.AddHs(mol)
157 mol.SetProp("_Name", name)
158
159 if options.etkdg:
160 cids = Chem.EmbedMultipleConfs(
161 mol, int(options.sample * options.maxconfs), Chem.ETKDG()
162 )
163 else:
164 cids = Chem.EmbedMultipleConfs(
165 mol, int(options.sample * options.maxconfs), randomSeed=options.seed
166 )
167 if options.verbose:
168 print(len(cids), "conformers found")
169 cenergy = []
170 for conf in cids:
171 # not passing confID only minimizes the first conformer
172 if options.nomin:
173 cenergy.append(conf)
174 elif options.mmff:
175 converged = Chem.MMFFOptimizeMolecule(mol, confId=conf)
176 mp = Chem.MMFFGetMoleculeProperties(mol)
177 cenergy.append(
178 Chem.MMFFGetMoleculeForceField(
179 mol, mp, confId=conf
180 ).CalcEnergy()
181 )
182 else:
183 converged = not Chem.UFFOptimizeMolecule(mol, confId=conf)
184 cenergy.append(
185 Chem.UFFGetMoleculeForceField(mol, confId=conf).CalcEnergy()
186 )
187 if options.verbose:
188 print("Convergence of conformer", conf, converged)
189
190 mol = Chem.RemoveHs(mol)
191 sortedcids = sorted(cids, key=lambda cid: cenergy[cid])
192 if len(sortedcids) > 0:
193 mine = cenergy[sortedcids[0]]
194 else:
195 mine = 0
196 if options.rms == 0:
197 cnt = 0
198 for conf in sortedcids:
199 if cnt >= options.maxconfs:
200 break
201 if (options.energy < 0) or cenergy[conf] - mine <= options.energy:
202 sdwriter.write(mol, conf)
203 cnt += 1
204 else:
205 written = {}
206 for conf in sortedcids:
207 if len(written) >= options.maxconfs:
208 break
209 # check rmsd
210 passed = True
211 for seenconf in written.keys():
212 rms = getRMS(mol, seenconf, conf)
213 if (rms < options.rms) or (
214 options.energy > 0 and cenergy[conf] - mine > options.energy
215 ):
216 passed = False
217 break
218 if passed:
219 written[conf] = True
220 sdwriter.write(mol, conf)
221 except (KeyboardInterrupt, SystemExit):
222 raise
223 except Exception as e:
224 print("Exception", e)
225 else:
226 print("ERROR:", smi)
227
228 sdwriter.close()
229 outf.close()