comparison dimorphite_dl.py @ 0:0f3e5c69251e draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 20df7e562341cd30e89a14d6bde9054956fadc06"
author bgruening
date Tue, 10 Mar 2020 12:57:24 -0400
parents
children bbbf5fb356dd
comparison
equal deleted inserted replaced
-1:000000000000 0:0f3e5c69251e
1 # Copyright 2018 Jacob D. Durrant
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 """
16 This script identifies and enumerates the possible protonation sites of SMILES
17 strings.
18 """
19
20 from __future__ import print_function
21 import copy
22 import os
23 import argparse
24 import sys
25
26 try:
27 # Python2
28 from StringIO import StringIO
29 except ImportError:
30 # Python3
31 from io import StringIO
32
33 # Always let the user know a help file is available.
34 print("\nFor help, use: python dimorphite_dl.py --help")
35
36 # And always report citation information.
37 print("\nIf you use Dimorphite-DL in your research, please cite:")
38 print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An")
39 print("open-source program for enumerating the ionization states of drug-like small")
40 print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n")
41
42 try:
43 import rdkit
44 from rdkit import Chem
45 from rdkit.Chem import AllChem
46 except:
47 msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/"
48 print(msg)
49 raise Exception(msg)
50
51 def main(params=None):
52 """The main definition run when you call the script from the commandline.
53
54 :param params: The parameters to use. Entirely optional. If absent,
55 defaults to None, in which case argments will be taken from
56 those given at the command line.
57 :param params: dict, optional
58 :return: Returns a list of the SMILES strings return_as_list parameter is
59 True. Otherwise, returns None.
60 """
61
62 parser = ArgParseFuncs.get_args()
63 args = vars(parser.parse_args())
64
65 # Add in any parameters in params.
66 if params is not None:
67 for k, v in params.items():
68 args[k] = v
69
70 # If being run from the command line, print out all parameters.
71 if __name__ == "__main__":
72 print("\nPARAMETERS:\n")
73 for k in sorted(args.keys()):
74 print(k.rjust(13) + ": " + str(args[k]))
75 print("")
76
77 if args["test"]:
78 # Run tests.
79 TestFuncs.test()
80 else:
81 # Run protonation
82 if "output_file" in args and args["output_file"] is not None:
83 # An output file was specified, so write to that.
84 with open(args["output_file"], "w") as file:
85 for protonated_smi in Protonate(args):
86 file.write(protonated_smi + "\n")
87 elif "return_as_list" in args and args["return_as_list"] == True:
88 return list(Protonate(args))
89 else:
90 # No output file specified. Just print it to the screen.
91 for protonated_smi in Protonate(args):
92 print(protonated_smi)
93
94 class MyParser(argparse.ArgumentParser):
95 """Overwrite default parse so it displays help file on error. See
96 https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu"""
97
98 def error(self, message):
99 """Overwrites the default error message.
100
101 :param message: The default error message.
102 """
103
104 self.print_help()
105 msg = "ERROR: %s\n\n" % message
106 print(msg)
107 raise Exception(msg)
108
109 def print_help(self, file=None):
110 """Overwrite the default print_help function
111
112 :param file: Output file, defaults to None
113 """
114
115 print("")
116
117 if file is None:
118 file = sys.stdout
119 self._print_message(self.format_help(), file)
120 print("""
121 examples:
122 python dimorphite_dl.py --smiles_file sample_molecules.smi
123 python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0
124 python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi
125 python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states
126 python dimorphite_dl.py --test""")
127 print("")
128
129 class ArgParseFuncs:
130 """A namespace for storing functions that are useful for processing
131 command-line arguments. To keep things organized."""
132
133 @staticmethod
134 def get_args():
135 """Gets the arguments from the command line.
136
137 :return: A parser object.
138 """
139
140 parser = MyParser(description="Dimorphite 1.2: Creates models of " +
141 "appropriately protonated small moleucles. " +
142 "Apache 2.0 License. Copyright 2018 Jacob D. " +
143 "Durrant.")
144 parser.add_argument('--min_ph', metavar='MIN', type=float, default=6.4,
145 help='minimum pH to consider (default: 6.4)')
146 parser.add_argument('--max_ph', metavar='MAX', type=float, default=8.4,
147 help='maximum pH to consider (default: 8.4)')
148 parser.add_argument('--pka_precision', metavar='PRE', type=float, default=1.0,
149 help='pKa precision factor (number of standard devations, default: 1.0)')
150 parser.add_argument('--smiles', metavar='SMI', type=str,
151 help='SMILES string to protonate')
152 parser.add_argument('--smiles_file', metavar="FILE", type=str,
153 help='file that contains SMILES strings to protonate')
154 parser.add_argument('--output_file', metavar="FILE", type=str,
155 help='output file to write protonated SMILES (optional)')
156 parser.add_argument('--label_states', action="store_true",
157 help='label protonated SMILES with target state ' + \
158 '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").')
159 parser.add_argument('--test', action="store_true",
160 help='run unit tests (for debugging)')
161
162 return parser
163
164 @staticmethod
165 def clean_args(args):
166 """Cleans and normalizes input parameters
167
168 :param args: A dictionary containing the arguments.
169 :type args: dict
170 :raises Exception: No SMILES in params.
171 """
172
173 defaults = {'min_ph' : 6.4,
174 'max_ph' : 8.4,
175 'pka_precision' : 1.0,
176 'label_states' : False,
177 'test' : False}
178
179 for key in defaults:
180 if key not in args:
181 args[key] = defaults[key]
182
183 keys = list(args.keys())
184 for key in keys:
185 if args[key] is None:
186 del args[key]
187
188 if not "smiles" in args and not "smiles_file" in args:
189 msg = "Error: No SMILES in params. Use the -h parameter for help."
190 print(msg)
191 raise Exception(msg)
192
193 # If the user provides a smiles string, turn it into a file-like StringIO
194 # object.
195 if "smiles" in args:
196 if isinstance(args["smiles"], str):
197 args["smiles_file"] = StringIO(args["smiles"])
198
199 args["smiles_and_data"] = LoadSMIFile(args["smiles_file"])
200
201 return args
202
203 class UtilFuncs:
204 """A namespace to store functions for manipulating mol objects. To keep
205 things organized."""
206
207 @staticmethod
208 def neutralize_mol(mol):
209 """All molecules should be neuralized to the extent possible. The user
210 should not be allowed to specify the valence of the atoms in most cases.
211
212 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized.
213 :return: The neutralized Mol object.
214 """
215
216 # Get the reaction data
217 rxn_data = [
218 ['[Ov1-1:1]', '[Ov2+0:1]-[H]'], # To handle O- bonded to only one atom (add hydrogen).
219 ['[#7v4+1:1]-[H]', '[#7v3+0:1]'], # To handle N+ bonded to a hydrogen (remove hydrogen).
220 ['[Ov2-:1]', '[Ov2+0:1]'], # To handle O- bonded to two atoms. Should not be Negative.
221 ['[#7v3+1:1]', '[#7v3+0:1]'], # To handle N+ bonded to three atoms. Should not be positive.
222 ['[#7v2-1:1]', '[#7+0:1]-[H]'], # To handle N- Bonded to two atoms. Add hydrogen.
223 # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To
224 # handle bad azide. Must be protonated. (Now handled elsewhere, before
225 # SMILES converted to Mol object.)
226 ['[H]-[N:1]-[N:2]#[N:3]', '[N:1]=[N+1:2]=[N:3]-[H]'] # To handle bad azide. R-N-N#N should be R-N=[N+]=N
227 ]
228
229 # Add substructures and reactions (initially none)
230 for i, rxn_datum in enumerate(rxn_data):
231 rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0]))
232 rxn_data[i].append(None)
233
234 # Add hydrogens (respects valence, so incomplete).
235 # Chem.calcImplicitValence(mol)
236 mol.UpdatePropertyCache(strict=False)
237 mol = Chem.AddHs(mol)
238
239 while True: # Keep going until all these issues have been resolved.
240 current_rxn = None # The reaction to perform.
241 current_rxn_str = None
242
243 for i, rxn_datum in enumerate(rxn_data):
244 reactant_smarts, product_smarts, substruct_match_mol, rxn_placeholder = rxn_datum
245 if mol.HasSubstructMatch(substruct_match_mol):
246 if rxn_placeholder is None:
247 current_rxn_str = reactant_smarts + '>>' + product_smarts
248 current_rxn = AllChem.ReactionFromSmarts(current_rxn_str)
249 rxn_data[i][3] = current_rxn # Update the placeholder.
250 else:
251 current_rxn = rxn_data[i][3]
252 break
253
254 # Perform the reaction if necessary
255 if current_rxn is None: # No reaction left, so break out of while loop.
256 break
257 else:
258 mol = current_rxn.RunReactants((mol,))[0][0]
259 mol.UpdatePropertyCache(strict=False) # Update valences
260
261 # The mols have been altered from the reactions described above, we need
262 # to resanitize them. Make sure aromatic rings are shown as such This
263 # catches all RDKit Errors. without the catchError and sanitizeOps the
264 # Chem.SanitizeMol can crash the program.
265 sanitize_string = Chem.SanitizeMol(
266 mol,
267 sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL,
268 catchErrors = True
269 )
270
271 return mol if sanitize_string.name == "SANITIZE_NONE" else None
272
273 @staticmethod
274 def convert_smiles_str_to_mol(smiles_str):
275 """Given a SMILES string, check that it is actually a string and not a
276 None. Then try to convert it to an RDKit Mol Object.
277
278 :param string smiles_str: The SMILES string.
279 :return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or
280 if it fails to convert to a Mol Obj
281 """
282
283 # Check that there are no type errors, ie Nones or non-string
284 # A non-string type will cause RDKit to hard crash
285 if smiles_str is None or type(smiles_str) is not str:
286 return None
287
288 # Try to fix azides here. They are just tricky to deal with.
289 smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N")
290 smiles_str = smiles_str.replace("NN#N", "N=[N+]=N")
291
292 # Now convert to a mol object. Note the trick that is necessary to
293 # capture RDKit error/warning messages. See
294 # https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable
295 stderr_fileno = sys.stderr.fileno()
296 stderr_save = os.dup(stderr_fileno)
297 stderr_pipe = os.pipe()
298 os.dup2(stderr_pipe[1], stderr_fileno)
299 os.close(stderr_pipe[1])
300
301 mol = Chem.MolFromSmiles(smiles_str)
302
303 os.close(stderr_fileno)
304 os.close(stderr_pipe[0])
305 os.dup2(stderr_save, stderr_fileno)
306 os.close(stderr_save)
307
308 # Check that there are None type errors Chem.MolFromSmiles has sanitize on
309 # which means if there is even a small error in the SMILES (kekulize,
310 # nitrogen charge...) then mol=None. ie.
311 # Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an
312 # nitrogen charge error. It is cased in a try statement to be overly
313 # cautious.
314
315 return None if mol is None else mol
316
317 @staticmethod
318 def eprint(*args, **kwargs):
319 """Error messages should be printed to STDERR. See
320 https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python"""
321
322 print(*args, file=sys.stderr, **kwargs)
323
324 class LoadSMIFile(object):
325 """A generator class for loading in the SMILES strings from a file, one at
326 a time."""
327
328 def __init__(self, filename):
329 """Initializes this class.
330
331 :param filename: The filename or file object (i.e., StringIO).
332 :type filename: str or StringIO
333 """
334
335 if type(filename) is str:
336 # It's a filename
337 self.f = open(filename, "r")
338 else:
339 # It's a file object (i.e., StringIO)
340 self.f = filename
341
342 def __iter__(self):
343 """Returns this generator object.
344
345 :return: This generator object.
346 :rtype: LoadSMIFile
347 """
348
349 return self
350
351 def __next__(self):
352 """Ensure Python3 compatibility.
353
354 :return: A dict, where the "smiles" key contains the canonical SMILES
355 string and the "data" key contains the remaining information
356 (e.g., the molecule name).
357 :rtype: dict
358 """
359
360 return self.next()
361
362 def next(self):
363 """Get the data associated with the next line.
364
365 :raises StopIteration: If there are no more lines left iin the file.
366 :return: A dict, where the "smiles" key contains the canonical SMILES
367 string and the "data" key contains the remaining information
368 (e.g., the molecule name).
369 :rtype: dict
370 """
371
372 line = self.f.readline()
373
374 if line == "":
375 # EOF
376 self.f.close()
377 raise StopIteration()
378 return
379
380 # Divide line into smi and data
381 splits = line.split()
382 if len(splits) != 0:
383 # Generate mol object
384 smiles_str = splits[0]
385
386 # Convert from SMILES string to RDKIT Mol. This series of tests is
387 # to make sure the SMILES string is properly formed and to get it
388 # into a canonical form. Filter if failed.
389 mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str)
390 if mol is None:
391 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
392 return self.next()
393
394 # Handle nuetralizing the molecules. Filter if failed.
395 mol = UtilFuncs.neutralize_mol(mol)
396 if mol is None:
397 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
398 return self.next()
399
400 # Remove the hydrogens.
401 try:
402 mol = Chem.RemoveHs(mol)
403 except:
404 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
405 return self.next()
406
407 if mol is None:
408 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
409 return self.next()
410
411 # Regenerate the smiles string (to standardize).
412 new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True)
413
414 return {
415 "smiles": new_mol_string,
416 "data": splits[1:]
417 }
418 else:
419 # Blank line? Go to next one.
420 return self.next()
421
422 class Protonate(object):
423 """A generator class for protonating SMILES strings, one at a time."""
424
425 def __init__(self, args):
426 """Initialize the generator.
427
428 :param args: A dictionary containing the arguments.
429 :type args: dict
430 """
431
432 # Make the args an object variable variable.
433 self.args = args
434
435 # A list to store the protonated SMILES strings associated with a
436 # single input model.
437 self.cur_prot_SMI = []
438
439 # Clean and normalize the args
440 self.args = ArgParseFuncs.clean_args(args)
441
442 # Load the substructures that can be protonated.
443 self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph(
444 self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"]
445 )
446
447 def __iter__(self):
448 """Returns this generator object.
449
450 :return: This generator object.
451 :rtype: Protonate
452 """
453
454 return self
455
456 def __next__(self):
457 """Ensure Python3 compatibility.
458
459 :return: A dict, where the "smiles" key contains the canonical SMILES
460 string and the "data" key contains the remaining information
461 (e.g., the molecule name).
462 :rtype: dict
463 """
464
465 return self.next()
466
467 def next(self):
468 """Get the next protonated SMILES string.
469
470 :raises StopIteration: If there are no more lines left iin the file.
471 :return: TODO A dict, where the "smiles" key contains the canonical SMILES
472 string and the "data" key contains the remaining information
473 (e.g., the molecule name).
474 :rtype: dict
475 """
476
477 # If there are any SMILES strings in self.cur_prot_SMI, just return
478 # the first one and update the list to include only the remaining.
479 if len(self.cur_prot_SMI) > 0:
480 first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:]
481 return first
482
483 # self.cur_prot_SMI is empty, so try to add more to it.
484
485 # Get the next SMILES string from the input file.
486 try:
487 smile_and_datum = self.args["smiles_and_data"].next()
488 except StopIteration:
489 # There are no more input smiles strings...
490 raise StopIteration()
491
492 smi = smile_and_datum["smiles"]
493 data = smile_and_datum["data"] # Everything on SMILES line but the
494 # SMILES string itself (e.g., the
495 # molecule name).
496
497 # Collect the data associated with this smiles (e.g., the molecule
498 # name).
499 tag = " ".join(data)
500
501 # sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH").
502 # Note that the second entry indicates what state the site SHOULD be
503 # in (not the one it IS in per the SMILES string). It's calculated
504 # based on the probablistic distributions obtained during training.
505 sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs)
506
507 new_smis = [smi]
508 for site in sites:
509 # Make a new smiles with the correct protonation state. Note that
510 # new_smis is a growing list. This is how multiple protonation
511 # sites are handled.
512
513 # new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site)
514 new_smis = ProtSubstructFuncs.protonate_site(new_smis, site)
515 # print(site, new_smis) # Good for debugging.
516
517 # Only add new smiles if not already in the list.
518 # for s in new_smis_to_perhaps_add:
519 # if not s in new_smis:
520 # new_smis.append(s)
521
522 # In some cases, the script might generate redundant molecules.
523 # Phosphonates, when the pH is between the two pKa values and the
524 # stdev value is big enough, for example, will generate two identical
525 # BOTH states. Let's remove this redundancy.
526 new_smis = list(set(new_smis))
527
528 # Deprotonating protonated aromatic nitrogen gives [nH-]. Change this
529 # to [n-]. This is a hack.
530 new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis]
531
532 # Sometimes Dimorphite-DL generates molecules that aren't actually
533 # possible. Simply convert these to mol objects to eliminate the bad
534 # ones (that are None).
535 new_smis = [s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None]
536
537 # If there are no smi left, return the input one at the very least.
538 # All generated forms have apparently been judged
539 # inappropriate/mal-formed.
540 if len(new_smis) == 0:
541 new_smis = [smi]
542
543 # If the user wants to see the target states, add those
544 # to the ends of each line.
545 if self.args["label_states"]:
546 states = '\t'.join([x[1] for x in sites])
547 new_lines = [x + "\t" + tag + "\t" + states for x in new_smis]
548 else:
549 new_lines = [x + "\t" + tag for x in new_smis]
550
551 self.cur_prot_SMI = new_lines
552
553 return self.next()
554
555 class ProtSubstructFuncs:
556 """A namespace to store functions for loading the substructures that can
557 be protonated. To keep things organized."""
558
559 @staticmethod
560 def load_protonation_substructs_calc_state_for_ph(min_ph=6.4, max_ph=8.4, pka_std_range=1):
561 """A pre-calculated list of R-groups with protonation sites, with their
562 likely pKa bins.
563
564 :param float min_ph: The lower bound on the pH range, defaults to 6.4.
565 :param float max_ph: The upper bound on the pH range, defaults to 8.4.
566 :param pka_std_range: Basically the precision (stdev from predicted pKa to
567 consider), defaults to 1.
568 :return: A dict of the protonation substructions for the specified pH
569 range.
570 """
571
572 subs = []
573 pwd = os.path.dirname(os.path.realpath(__file__))
574
575 site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts")
576 with open(site_structures_file, 'r') as substruct:
577 for line in substruct:
578 line = line.strip()
579 sub = {}
580 if line is not "":
581 splits = line.split()
582 sub["name"] = splits[0]
583 sub["smart"] = splits[1]
584 sub["mol"] = Chem.MolFromSmarts(sub["smart"])
585
586 # NEED TO DIVIDE THIS BY 3s
587 pka_ranges = [splits[i:i+3] for i in range(2, len(splits)-1, 3)]
588
589 prot = []
590 for pka_range in pka_ranges:
591 site = pka_range[0]
592 std = float(pka_range[2]) * pka_std_range
593 mean = float(pka_range[1])
594 protonation_state = ProtSubstructFuncs.define_protonation_state(
595 mean, std, min_ph, max_ph
596 )
597
598 prot.append([site, protonation_state])
599
600 sub["prot_states_for_pH"] = prot
601 subs.append(sub)
602 return subs
603
604 @staticmethod
605 def define_protonation_state(mean, std, min_ph, max_ph):
606 """Updates the substructure definitions to include the protonation state
607 based on the user-given pH range. The size of the pKa range is also based
608 on the number of standard deviations to be considered by the user param.
609
610 :param float mean: The mean pKa.
611 :param float std: The precision (stdev).
612 :param float min_ph: The min pH of the range.
613 :param float max_ph: The max pH of the range.
614 :return: A string describing the protonation state.
615 """
616
617 min_pka = mean - std
618 max_pka = mean + std
619
620 # This needs to be reassigned, and 'ERROR' should never make it past the
621 # next set of checks.
622 if min_pka <= max_ph and min_ph <= max_pka:
623 protonation_state = 'BOTH'
624 elif mean > max_ph:
625 protonation_state = 'PROTONATED'
626 else:
627 protonation_state = 'DEPROTONATED'
628
629 return protonation_state
630
631 @staticmethod
632 def get_prot_sites_and_target_states(smi, subs):
633 """For a single molecule, find all possible matches in the protonation
634 R-group list, subs. Items that are higher on the list will be matched
635 first, to the exclusion of later items.
636
637 :param string smi: A SMILES string.
638 :param list subs: Substructure information.
639 :return: A list of protonation sites and their pKa bin. ('PROTONATED',
640 'BOTH', or 'DEPROTONATED')
641 """
642
643 # Convert the Smiles string (smi) to an RDKit Mol Obj
644 mol = UtilFuncs.convert_smiles_str_to_mol(smi)
645
646 # Check Conversion worked
647 if mol is None:
648 UtilFuncs.eprint("ERROR: ", smi)
649 return []
650
651 # Try to Add hydrogens. if failed return []
652 try:
653 mol = Chem.AddHs(mol)
654 except:
655 UtilFuncs.eprint("ERROR: ", smi)
656 return []
657
658 # Check adding Hs worked
659 if mol is None:
660 UtilFuncs.eprint("ERROR: ", smi)
661 return []
662
663 ProtectUnprotectFuncs.unprotect_molecule(mol)
664 protonation_sites = []
665
666 for item in subs:
667 smart = item["mol"]
668 if mol.HasSubstructMatch(smart):
669 matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart)
670 prot = item["prot_states_for_pH"]
671 for match in matches:
672 # We want to move the site from being relative to the
673 # substructure, to the index on the main molecule.
674 for site in prot:
675 proton = int(site[0])
676 category = site[1]
677 new_site = (match[proton], category, item["name"])
678
679 if not new_site in protonation_sites:
680 # Because sites must be unique.
681 protonation_sites.append(new_site)
682
683 ProtectUnprotectFuncs.protect_molecule(mol, match)
684
685 return protonation_sites
686
687 @staticmethod
688 def protonate_site(smis, site):
689 """Given a list of SMILES strings, we protonate the site.
690
691 :param list smis: The list of SMILES strings.
692 :param tuple site: Information about the protonation site.
693 (idx, target_prot_state, prot_site_name)
694 :return: A list of the appropriately protonated SMILES.
695 """
696
697 # Decouple the atom index and its target protonation state from the site
698 # tuple
699 idx, target_prot_state, prot_site_name = site
700
701 # Initialize the output list
702 output_smis = []
703
704 state_to_charge = {"DEPROTONATED": [-1],
705 "PROTONATED": [0],
706 "BOTH": [-1, 0]}
707
708 charges = state_to_charge[target_prot_state]
709
710 # Now make the actual smiles match the target protonation state.
711 output_smis = ProtSubstructFuncs.set_protonation_charge(smis, idx, charges, prot_site_name)
712
713 return output_smis
714
715 @staticmethod
716 def set_protonation_charge(smis, idx, charges, prot_site_name):
717 """Sets the atomic charge on a particular site for a set of SMILES.
718
719 :param list smis: A list of the SMILES strings.
720 :param int idx: The index of the atom to consider.
721 :param list charges: A list of the charges (ints) to assign at
722 this site.
723 :param string prot_site_name: The name of the protonation site.
724 :return: A list of the processed SMILES strings.
725 """
726
727 # Sets up the output list and the Nitrogen charge
728 output = []
729
730 for charge in charges:
731 # The charge for Nitrogens is 1 higher than others (i.e., protonated
732 # state is positively charged).
733 nitro_charge = charge + 1
734
735 # But there are a few nitrogen moieties where the acidic group is the
736 # neutral one. Amides are a good example. I gave some thought re. how
737 # to best flag these. I decided that those nitrogen-containing
738 # moieties where the acidic group is neutral (rather than positively
739 # charged) will have "*" in the name.
740 if "*" in prot_site_name:
741 nitro_charge = nitro_charge - 1 # Undo what was done previously.
742
743 for smi in smis:
744
745 # Convert smilesstring (smi) into a RDKit Mol
746 mol = UtilFuncs.convert_smiles_str_to_mol(smi)
747
748 # Check that the conversion worked, skip if it fails
749 if mol is None:
750 continue
751
752 atom = mol.GetAtomWithIdx(idx)
753
754 # Assign the protonation charge, with special care for Nitrogens
755 element = atom.GetAtomicNum()
756 if element == 7:
757 atom.SetFormalCharge(nitro_charge)
758 else:
759 atom.SetFormalCharge(charge)
760
761 # Convert back to SMILE and add to output
762 out_smile = Chem.MolToSmiles(mol, isomericSmiles=True,canonical=True)
763 output.append(out_smile)
764
765 return output
766
767 class ProtectUnprotectFuncs:
768 """A namespace for storing functions that are useful for protecting and
769 unprotecting molecules. To keep things organized. We need to identify and
770 mark groups that have been matched with a substructure."""
771
772 @staticmethod
773 def unprotect_molecule(mol):
774 """Sets the protected property on all atoms to 0. This also creates the
775 property for new molecules.
776
777 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object.
778 :type mol: The rdkit Mol object with atoms unprotected.
779 """
780
781 for atom in mol.GetAtoms():
782 atom.SetProp('_protected', '0')
783
784 @staticmethod
785 def protect_molecule(mol, match):
786 """Given a 'match', a list of molecules idx's, we set the protected status
787 of each atom to 1. This will prevent any matches using that atom in the
788 future.
789
790 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect.
791 :param list match: A list of molecule idx's.
792 """
793
794 for idx in match:
795 atom = mol.GetAtomWithIdx(idx)
796 atom.SetProp('_protected', '1')
797
798 @staticmethod
799 def get_unprotected_matches(mol, substruct):
800 """Finds substructure matches with atoms that have not been protected.
801 Returns list of matches, each match a list of atom idxs.
802
803 :param rdkit.Chem.rdchem.Mol mol: The Mol object to consider.
804 :param string substruct: The SMARTS string of the substructure ot match.
805 :return: A list of the matches. Each match is itself a list of atom idxs.
806 """
807
808 matches = mol.GetSubstructMatches(substruct)
809 unprotected_matches = []
810 for match in matches:
811 if ProtectUnprotectFuncs.is_match_unprotected(mol, match):
812 unprotected_matches.append(match)
813 return unprotected_matches
814
815 @staticmethod
816 def is_match_unprotected(mol, match):
817 """Checks a molecule to see if the substructure match contains any
818 protected atoms.
819
820 :param rdkit.Chem.rdchem.Mol mol: The Mol object to check.
821 :param list match: The match to check.
822 :return: A boolean, whether the match is present or not.
823 """
824
825 for idx in match:
826 atom = mol.GetAtomWithIdx(idx)
827 protected = atom.GetProp("_protected")
828 if protected == "1":
829 return False
830 return True
831
832 class TestFuncs:
833 """A namespace for storing functions that perform tests on the code. To
834 keep things organized."""
835
836 @staticmethod
837 def test():
838 """Tests all the 38 groups."""
839
840 smis = [
841 # [input smiles, pka, protonated, deprotonated, category]
842 ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"],
843 ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"],
844 ["CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O", "Amide_electronegative"],
845 ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"],
846 ["Brc1ccc(C2NCCS2)cc1", "Brc1ccc(C2[NH2+]CCS2)cc1", "Brc1ccc(C2NCCS2)cc1", "Amines_primary_secondary_tertiary"],
847 ["CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc([NH3+])cc1", "CC(=O)[n+]1ccc(N)cc1", "Anilines_primary"],
848 ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"],
849 ["Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C", "Cc1ccccc1N(C)C", "Anilines_tertiary"],
850 ["BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[nH]ccc2c1", "Brc1ccc2[n-]ccc2c1", "Indole_pyrrole"],
851 ["O=c1cc[nH]cc1", "O=c1cc[nH]cc1", "O=c1cc[n-]cc1", "Aromatic_nitrogen_protonated"],
852 ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"],
853 ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"],
854 ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"],
855 ["C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F", "CC(=O)[N-]C(=O)C(F)(F)F", "Imide"],
856 ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"],
857 ["CC(C)(C)C(N(C)O)=O", "CN(O)C(=O)C(C)(C)C", "CN([O-])C(=O)C(C)(C)C", "N-hydroxyamide"],
858 ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"],
859 ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"],
860 ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"],
861 ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"],
862 ["Brc1cc(O)cc(Br)c1", "Oc1cc(Br)cc(Br)c1", "[O-]c1cc(Br)cc(Br)c1", "Phenol"],
863 ["CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc([S-])cc1", "Phenyl_Thiol"],
864 ["C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)[O-])cc1", "Phenyl_carboxyl"],
865 ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"],
866 ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"],
867 ["CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]", "Phosphonate_ester"],
868 ["CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O", "Ringed_imide1"],
869 ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"],
870 ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"],
871 ["COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)[O-])cc1", "Sulfinic_acid"],
872 ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"],
873 ["CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)O", "CC(=O)CSCCS(=O)(=O)[O-]", "Sulfonate"],
874 ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"],
875 ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"],
876 ["Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", "Aromatic_nitrogen_unprotonated"],
877 ["C=C(O)c1c(C)cc(C)cc1C", "C=C(O)c1c(C)cc(C)cc1C", "C=C([O-])c1c(C)cc(C)cc1C", "Vinyl_alcohol"],
878 ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"]
879 ]
880
881 smis_phos = [
882 ["O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)([O-])O", "CCCCOP(=O)([O-])[O-]", "Phosphate"],
883 ["CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O", "CC(C)P(=O)([O-])O", "CC(C)P(=O)([O-])[O-]", "Phosphonate"]
884 ]
885
886 # Load the average pKa values.
887 average_pkas = {l.split()[0].replace("*", ""):float(l.split()[3]) for l in open("site_substructures.smarts") if l.split()[0] not in ["Phosphate", "Phosphonate"]}
888 average_pkas_phos = {l.split()[0].replace("*", ""):[float(l.split()[3]), float(l.split()[6])] for l in open("site_substructures.smarts") if l.split()[0] in ["Phosphate", "Phosphonate"]}
889
890 print("Running Tests")
891 print("=============")
892 print("")
893
894 print("Very Acidic (pH -10000000)")
895 print("--------------------------")
896 print("")
897
898 args = {
899 "min_ph": -10000000,
900 "max_ph": -10000000,
901 "pka_precision": 0.5,
902 "smiles": "",
903 "label_states": True
904 }
905
906 for smi, protonated, deprotonated, category in smis:
907 args["smiles"] = smi
908 TestFuncs.test_check(args, [protonated], ["PROTONATED"])
909
910 for smi, protonated, mix, deprotonated, category in smis_phos:
911 args["smiles"] = smi
912 TestFuncs.test_check(args, [protonated], ["PROTONATED"])
913
914 args["min_ph"] = 10000000
915 args["max_ph"] = 10000000
916
917 print("")
918 print("Very Basic (pH 10000000)")
919 print("------------------------")
920 print("")
921
922 for smi, protonated, deprotonated, category in smis:
923 args["smiles"] = smi
924 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
925
926 for smi, protonated, mix, deprotonated, category in smis_phos:
927 args["smiles"] = smi
928 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
929
930 print("")
931 print("pH is Category pKa")
932 print("------------------")
933 print("")
934
935 for smi, protonated, deprotonated, category in smis:
936 avg_pka = average_pkas[category]
937
938 args["smiles"] = smi
939 args["min_ph"] = avg_pka
940 args["max_ph"] = avg_pka
941
942 TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"])
943
944 for smi, protonated, mix, deprotonated, category in smis_phos:
945 args["smiles"] = smi
946
947 avg_pka = average_pkas_phos[category][0]
948 args["min_ph"] = avg_pka
949 args["max_ph"] = avg_pka
950
951 TestFuncs.test_check(args, [mix, protonated], ["BOTH"])
952
953 avg_pka = average_pkas_phos[category][1]
954 args["min_ph"] = avg_pka
955 args["max_ph"] = avg_pka
956
957 TestFuncs.test_check(args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"])
958
959 avg_pka = 0.5 * (average_pkas_phos[category][0] + average_pkas_phos[category][1])
960 args["min_ph"] = avg_pka
961 args["max_ph"] = avg_pka
962 args["pka_precision"] = 5 # Should give all three
963
964 TestFuncs.test_check(args, [mix, deprotonated, protonated], ["BOTH", "BOTH"])
965
966 @staticmethod
967 def test_check(args, expected_output, labels):
968 """Tests most ionizable groups. The ones that can only loose or gain a single proton.
969
970 :param args: The arguments to pass to protonate()
971 :param expected_output: A list of the expected SMILES-strings output.
972 :param labels: The labels. A list containing combo of BOTH, PROTONATED,
973 DEPROTONATED.
974 :raises Exception: Wrong number of states produced.
975 :raises Exception: Unexpected output SMILES.
976 :raises Exception: Wrong labels.
977 """
978
979 output = list(Protonate(args))
980 output = [o.split() for o in output]
981
982 num_states = len(expected_output)
983
984 if (len(output) != num_states):
985 msg = args["smiles"] + " should have " + str(num_states) + \
986 " states at at pH " + str(args["min_ph"]) + ": " + str(output)
987 print(msg)
988 raise Exception(msg)
989
990 if (len(set([l[0] for l in output]) - set(expected_output)) != 0):
991 msg = args["smiles"] + " is not " + " AND ".join(expected_output) + \
992 " at pH " + str(args["min_ph"]) + " - " + str(args["max_ph"]) + \
993 "; it is " + " AND ".join([l[0] for l in output])
994 print(msg)
995 raise Exception(msg)
996
997 if (len(set([l[1] for l in output]) - set(labels)) != 0):
998 msg = args["smiles"] + " not labeled as " + " AND ".join(labels) + \
999 "; it is " + " AND ".join([l[1] for l in output])
1000 print(msg)
1001 raise Exception(msg)
1002
1003 ph_range = sorted(list(set([args["min_ph"], args["max_ph"]])))
1004 ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")"
1005 print("(CORRECT) " + ph_range_str.ljust(10) + " " + args["smiles"] + " => " + " AND ".join([l[0] for l in output]))
1006
1007 def run(**kwargs):
1008 """A helpful, importable function for those who want to call Dimorphite-DL
1009 from another Python script rather than the command line. Note that this
1010 function accepts keyword arguments that match the command-line parameters
1011 exactly. If you want to pass and return a list of RDKit Mol objects, import
1012 run_with_mol_list() instead.
1013
1014 :param **kwargs: For a complete description, run dimorphite_dl.py from the
1015 command line with the -h option.
1016 :type kwargs: dict
1017 """
1018
1019 # Run the main function with the specified arguments.
1020 main(kwargs)
1021
1022 def run_with_mol_list(mol_lst, **kwargs):
1023 """A helpful, importable function for those who want to call Dimorphite-DL
1024 from another Python script rather than the command line. Note that this
1025 function is for passing Dimorphite-DL a list of RDKit Mol objects, together
1026 with command-line parameters. If you want to use only the same parameters
1027 that you would use from the command line, import run() instead.
1028
1029 :param mol_lst: A list of rdkit.Chem.rdchem.Mol objects.
1030 :type mol_lst: list
1031 :raises Exception: If the **kwargs includes "smiles", "smiles_file",
1032 "output_file", or "test" parameters.
1033 :return: A list of properly protonated rdkit.Chem.rdchem.Mol objects.
1034 :rtype: list
1035 """
1036
1037 # Do a quick check to make sure the user input makes sense.
1038 for bad_arg in ["smiles", "smiles_file", "output_file", "test"]:
1039 if bad_arg in kwargs:
1040 msg = "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + \
1041 "**kwargs) function, but you also passed the \"" + \
1042 bad_arg + "\" argument. Did you mean to use the " + \
1043 "run(**kwargs) function instead?"
1044 print(msg)
1045 raise Exception(msg)
1046
1047 # Set the return_as_list flag so main() will return the protonated smiles
1048 # as a list.
1049 kwargs["return_as_list"] = True
1050
1051 # Having reviewed the code, it will be very difficult to rewrite it so
1052 # that a list of Mol objects can be used directly. Intead, convert this
1053 # list of mols to smiles and pass that. Not efficient, but it will work.
1054 protonated_smiles_and_props = []
1055 for m in mol_lst:
1056 props = m.GetPropsAsDict()
1057 kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True)
1058 protonated_smiles_and_props.extend(
1059 [(s.split("\t")[0], props) for s in main(kwargs)]
1060 )
1061
1062 # Now convert the list of protonated smiles strings back to RDKit Mol
1063 # objects. Also, add back in the properties from the original mol objects.
1064 mols = []
1065 for s, props in protonated_smiles_and_props:
1066 m = Chem.MolFromSmiles(s)
1067 if m:
1068 for prop, val in props.items():
1069 if type(val) is int:
1070 m.SetIntProp(prop, val)
1071 elif type(val) is float:
1072 m.SetDoubleProp(prop, val)
1073 elif type(val) is bool:
1074 m.SetBoolProp(prop, val)
1075 else:
1076 m.SetProp(prop, str(val))
1077 mols.append(m)
1078 else:
1079 UtilFuncs.eprint("WARNING: Could not process molecule with SMILES string " + s + " and properties " + str(props))
1080
1081 return mols
1082
1083 if __name__ == "__main__":
1084 main()