# HG changeset patch
# User bgruening
# Date 1638635971 0
# Node ID 5c501eb8d56c326f4c9b9aeb0e70ddf42f4cbf67
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
diff -r 000000000000 -r 5c501eb8d56c dimorphite_dl.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dimorphite_dl.py Sat Dec 04 16:39:31 2021 +0000
@@ -0,0 +1,1311 @@
+# flake8: noqa
+# Copyright 2018 Jacob D. Durrant
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script identifies and enumerates the possible protonation sites of SMILES
+strings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+try:
+ # Python2
+ from StringIO import StringIO
+except ImportError:
+ # Python3
+ from io import StringIO
+
+# Always let the user know a help file is available.
+print("\nFor help, use: python dimorphite_dl.py --help")
+
+# And always report citation information.
+print("\nIf you use Dimorphite-DL in your research, please cite:")
+print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An")
+print("open-source program for enumerating the ionization states of drug-like small")
+print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n")
+
+try:
+ import rdkit
+ from rdkit import Chem
+ from rdkit.Chem import AllChem
+except Exception:
+ msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/"
+ print(msg)
+ raise Exception(msg)
+
+
+def main(params=None):
+ """The main definition run when you call the script from the commandline.
+
+ :param params: The parameters to use. Entirely optional. If absent,
+ defaults to None, in which case argments will be taken from
+ those given at the command line.
+ :param params: dict, optional
+ :return: Returns a list of the SMILES strings return_as_list parameter is
+ True. Otherwise, returns None.
+ """
+
+ parser = ArgParseFuncs.get_args()
+ args = vars(parser.parse_args())
+
+ # Add in any parameters in params.
+ if params is not None:
+ for k, v in params.items():
+ args[k] = v
+
+ # If being run from the command line, print out all parameters.
+ if __name__ == "__main__":
+ print("\nPARAMETERS:\n")
+ for k in sorted(args.keys()):
+ print(k.rjust(13) + ": " + str(args[k]))
+ print("")
+
+ if args["test"]:
+ # Run tests.
+ TestFuncs.test()
+ else:
+ # Run protonation
+ if "output_file" in args and args["output_file"] is not None:
+ # An output file was specified, so write to that.
+ with open(args["output_file"], "w") as file:
+ for protonated_smi in Protonate(args):
+ file.write(protonated_smi + "\n")
+ elif "return_as_list" in args and args["return_as_list"]:
+ return list(Protonate(args))
+ else:
+ # No output file specified. Just print it to the screen.
+ for protonated_smi in Protonate(args):
+ print(protonated_smi)
+
+
+class MyParser(argparse.ArgumentParser):
+ """Overwrite default parse so it displays help file on error. See
+ https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu"""
+
+ def error(self, message):
+ """Overwrites the default error message.
+
+ :param message: The default error message.
+ """
+
+ self.print_help()
+ msg = "ERROR: %s\n\n" % message
+ print(msg)
+ raise Exception(msg)
+
+ def print_help(self, file=None):
+ """Overwrite the default print_help function
+
+ :param file: Output file, defaults to None
+ """
+
+ print("")
+
+ if file is None:
+ file = sys.stdout
+ self._print_message(self.format_help(), file)
+ print(
+ """
+examples:
+ python dimorphite_dl.py --smiles_file sample_molecules.smi
+ python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0
+ python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi
+ python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states
+ python dimorphite_dl.py --test"""
+ )
+ print("")
+
+
+class ArgParseFuncs:
+ """A namespace for storing functions that are useful for processing
+ command-line arguments. To keep things organized."""
+
+ @staticmethod
+ def get_args():
+ """Gets the arguments from the command line.
+
+ :return: A parser object.
+ """
+
+ parser = MyParser(
+ description="Dimorphite 1.2: Creates models of "
+ + "appropriately protonated small moleucles. "
+ + "Apache 2.0 License. Copyright 2018 Jacob D. "
+ + "Durrant."
+ )
+ parser.add_argument(
+ "--min_ph",
+ metavar="MIN",
+ type=float,
+ default=6.4,
+ help="minimum pH to consider (default: 6.4)",
+ )
+ parser.add_argument(
+ "--max_ph",
+ metavar="MAX",
+ type=float,
+ default=8.4,
+ help="maximum pH to consider (default: 8.4)",
+ )
+ parser.add_argument(
+ "--pka_precision",
+ metavar="PRE",
+ type=float,
+ default=1.0,
+ help="pKa precision factor (number of standard devations, default: 1.0)",
+ )
+ parser.add_argument(
+ "--smiles", metavar="SMI", type=str, help="SMILES string to protonate"
+ )
+ parser.add_argument(
+ "--smiles_file",
+ metavar="FILE",
+ type=str,
+ help="file that contains SMILES strings to protonate",
+ )
+ parser.add_argument(
+ "--output_file",
+ metavar="FILE",
+ type=str,
+ help="output file to write protonated SMILES (optional)",
+ )
+ parser.add_argument(
+ "--label_states",
+ action="store_true",
+ help="label protonated SMILES with target state "
+ + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").',
+ )
+ parser.add_argument(
+ "--test", action="store_true", help="run unit tests (for debugging)"
+ )
+
+ return parser
+
+ @staticmethod
+ def clean_args(args):
+ """Cleans and normalizes input parameters
+
+ :param args: A dictionary containing the arguments.
+ :type args: dict
+ :raises Exception: No SMILES in params.
+ """
+
+ defaults = {
+ "min_ph": 6.4,
+ "max_ph": 8.4,
+ "pka_precision": 1.0,
+ "label_states": False,
+ "test": False,
+ }
+
+ for key in defaults:
+ if key not in args:
+ args[key] = defaults[key]
+
+ keys = list(args.keys())
+ for key in keys:
+ if args[key] is None:
+ del args[key]
+
+ if not "smiles" in args and not "smiles_file" in args:
+ msg = "Error: No SMILES in params. Use the -h parameter for help."
+ print(msg)
+ raise Exception(msg)
+
+ # If the user provides a smiles string, turn it into a file-like StringIO
+ # object.
+ if "smiles" in args:
+ if isinstance(args["smiles"], str):
+ args["smiles_file"] = StringIO(args["smiles"])
+
+ args["smiles_and_data"] = LoadSMIFile(args["smiles_file"])
+
+ return args
+
+
+class UtilFuncs:
+ """A namespace to store functions for manipulating mol objects. To keep
+ things organized."""
+
+ @staticmethod
+ def neutralize_mol(mol):
+ """All molecules should be neuralized to the extent possible. The user
+ should not be allowed to specify the valence of the atoms in most cases.
+
+ :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized.
+ :return: The neutralized Mol object.
+ """
+
+ # Get the reaction data
+ rxn_data = [
+ [
+ "[Ov1-1:1]",
+ "[Ov2+0:1]-[H]",
+ ], # To handle O- bonded to only one atom (add hydrogen).
+ [
+ "[#7v4+1:1]-[H]",
+ "[#7v3+0:1]",
+ ], # To handle N+ bonded to a hydrogen (remove hydrogen).
+ [
+ "[Ov2-:1]",
+ "[Ov2+0:1]",
+ ], # To handle O- bonded to two atoms. Should not be Negative.
+ [
+ "[#7v3+1:1]",
+ "[#7v3+0:1]",
+ ], # To handle N+ bonded to three atoms. Should not be positive.
+ [
+ "[#7v2-1:1]",
+ "[#7+0:1]-[H]",
+ ], # To handle N- Bonded to two atoms. Add hydrogen.
+ # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To
+ # handle bad azide. Must be protonated. (Now handled elsewhere, before
+ # SMILES converted to Mol object.)
+ [
+ "[H]-[N:1]-[N:2]#[N:3]",
+ "[N:1]=[N+1:2]=[N:3]-[H]",
+ ], # To handle bad azide. R-N-N#N should be R-N=[N+]=N
+ ]
+
+ # Add substructures and reactions (initially none)
+ for i, rxn_datum in enumerate(rxn_data):
+ rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0]))
+ rxn_data[i].append(None)
+
+ # Add hydrogens (respects valence, so incomplete).
+ # Chem.calcImplicitValence(mol)
+ mol.UpdatePropertyCache(strict=False)
+ mol = Chem.AddHs(mol)
+
+ while True: # Keep going until all these issues have been resolved.
+ current_rxn = None # The reaction to perform.
+ current_rxn_str = None
+
+ for i, rxn_datum in enumerate(rxn_data):
+ (
+ reactant_smarts,
+ product_smarts,
+ substruct_match_mol,
+ rxn_placeholder,
+ ) = rxn_datum
+ if mol.HasSubstructMatch(substruct_match_mol):
+ if rxn_placeholder is None:
+ current_rxn_str = reactant_smarts + ">>" + product_smarts
+ current_rxn = AllChem.ReactionFromSmarts(current_rxn_str)
+ rxn_data[i][3] = current_rxn # Update the placeholder.
+ else:
+ current_rxn = rxn_data[i][3]
+ break
+
+ # Perform the reaction if necessary
+ if current_rxn is None: # No reaction left, so break out of while loop.
+ break
+ else:
+ mol = current_rxn.RunReactants((mol,))[0][0]
+ mol.UpdatePropertyCache(strict=False) # Update valences
+
+ # The mols have been altered from the reactions described above, we need
+ # to resanitize them. Make sure aromatic rings are shown as such This
+ # catches all RDKit Errors. without the catchError and sanitizeOps the
+ # Chem.SanitizeMol can crash the program.
+ sanitize_string = Chem.SanitizeMol(
+ mol,
+ sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL,
+ catchErrors=True,
+ )
+
+ return mol if sanitize_string.name == "SANITIZE_NONE" else None
+
+ @staticmethod
+ def convert_smiles_str_to_mol(smiles_str):
+ """Given a SMILES string, check that it is actually a string and not a
+ None. Then try to convert it to an RDKit Mol Object.
+
+ :param string smiles_str: The SMILES string.
+ :return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or
+ if it fails to convert to a Mol Obj
+ """
+
+ # Check that there are no type errors, ie Nones or non-string
+ # A non-string type will cause RDKit to hard crash
+ if smiles_str is None or type(smiles_str) is not str:
+ return None
+
+ # Try to fix azides here. They are just tricky to deal with.
+ smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N")
+ smiles_str = smiles_str.replace("NN#N", "N=[N+]=N")
+
+ # Now convert to a mol object. Note the trick that is necessary to
+ # capture RDKit error/warning messages. See
+ # https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable
+ stderr_fileno = sys.stderr.fileno()
+ stderr_save = os.dup(stderr_fileno)
+ stderr_pipe = os.pipe()
+ os.dup2(stderr_pipe[1], stderr_fileno)
+ os.close(stderr_pipe[1])
+
+ mol = Chem.MolFromSmiles(smiles_str)
+
+ os.close(stderr_fileno)
+ os.close(stderr_pipe[0])
+ os.dup2(stderr_save, stderr_fileno)
+ os.close(stderr_save)
+
+ # Check that there are None type errors Chem.MolFromSmiles has sanitize on
+ # which means if there is even a small error in the SMILES (kekulize,
+ # nitrogen charge...) then mol=None. ie.
+ # Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an
+ # nitrogen charge error. It is cased in a try statement to be overly
+ # cautious.
+
+ return None if mol is None else mol
+
+ @staticmethod
+ def eprint(*args, **kwargs):
+ """Error messages should be printed to STDERR. See
+ https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python"""
+
+ print(*args, file=sys.stderr, **kwargs)
+
+
+class LoadSMIFile(object):
+ """A generator class for loading in the SMILES strings from a file, one at
+ a time."""
+
+ def __init__(self, filename):
+ """Initializes this class.
+
+ :param filename: The filename or file object (i.e., StringIO).
+ :type filename: str or StringIO
+ """
+
+ if type(filename) is str:
+ # It's a filename
+ self.f = open(filename, "r")
+ else:
+ # It's a file object (i.e., StringIO)
+ self.f = filename
+
+ def __iter__(self):
+ """Returns this generator object.
+
+ :return: This generator object.
+ :rtype: LoadSMIFile
+ """
+
+ return self
+
+ def __next__(self):
+ """Ensure Python3 compatibility.
+
+ :return: A dict, where the "smiles" key contains the canonical SMILES
+ string and the "data" key contains the remaining information
+ (e.g., the molecule name).
+ :rtype: dict
+ """
+
+ return self.next()
+
+ def next(self):
+ """Get the data associated with the next line.
+
+ :raises StopIteration: If there are no more lines left iin the file.
+ :return: A dict, where the "smiles" key contains the canonical SMILES
+ string and the "data" key contains the remaining information
+ (e.g., the molecule name).
+ :rtype: dict
+ """
+
+ line = self.f.readline()
+
+ if line == "":
+ # EOF
+ self.f.close()
+ raise StopIteration()
+ return
+
+ # Divide line into smi and data
+ splits = line.split()
+ if len(splits) != 0:
+ # Generate mol object
+ smiles_str = splits[0]
+
+ # Convert from SMILES string to RDKIT Mol. This series of tests is
+ # to make sure the SMILES string is properly formed and to get it
+ # into a canonical form. Filter if failed.
+ mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str)
+ if mol is None:
+ UtilFuncs.eprint(
+ "WARNING: Skipping poorly formed SMILES string: " + line
+ )
+ return self.next()
+
+ # Handle nuetralizing the molecules. Filter if failed.
+ mol = UtilFuncs.neutralize_mol(mol)
+ if mol is None:
+ UtilFuncs.eprint(
+ "WARNING: Skipping poorly formed SMILES string: " + line
+ )
+ return self.next()
+
+ # Remove the hydrogens.
+ try:
+ mol = Chem.RemoveHs(mol)
+ except Exception:
+ UtilFuncs.eprint(
+ "WARNING: Skipping poorly formed SMILES string: " + line
+ )
+ return self.next()
+
+ if mol is None:
+ UtilFuncs.eprint(
+ "WARNING: Skipping poorly formed SMILES string: " + line
+ )
+ return self.next()
+
+ # Regenerate the smiles string (to standardize).
+ new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True)
+
+ return {"smiles": new_mol_string, "data": splits[1:]}
+ else:
+ # Blank line? Go to next one.
+ return self.next()
+
+
+class Protonate(object):
+ """A generator class for protonating SMILES strings, one at a time."""
+
+ def __init__(self, args):
+ """Initialize the generator.
+
+ :param args: A dictionary containing the arguments.
+ :type args: dict
+ """
+
+ # Make the args an object variable variable.
+ self.args = args
+
+ # A list to store the protonated SMILES strings associated with a
+ # single input model.
+ self.cur_prot_SMI = []
+
+ # Clean and normalize the args
+ self.args = ArgParseFuncs.clean_args(args)
+
+ # Load the substructures that can be protonated.
+ self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph(
+ self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"]
+ )
+
+ def __iter__(self):
+ """Returns this generator object.
+
+ :return: This generator object.
+ :rtype: Protonate
+ """
+
+ return self
+
+ def __next__(self):
+ """Ensure Python3 compatibility.
+
+ :return: A dict, where the "smiles" key contains the canonical SMILES
+ string and the "data" key contains the remaining information
+ (e.g., the molecule name).
+ :rtype: dict
+ """
+
+ return self.next()
+
+ def next(self):
+ """Get the next protonated SMILES string.
+
+ :raises StopIteration: If there are no more lines left iin the file.
+ :return: TODO A dict, where the "smiles" key contains the canonical SMILES
+ string and the "data" key contains the remaining information
+ (e.g., the molecule name).
+ :rtype: dict
+ """
+
+ # If there are any SMILES strings in self.cur_prot_SMI, just return
+ # the first one and update the list to include only the remaining.
+ if len(self.cur_prot_SMI) > 0:
+ first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:]
+ return first
+
+ # self.cur_prot_SMI is empty, so try to add more to it.
+
+ # Get the next SMILES string from the input file.
+ try:
+ smile_and_datum = self.args["smiles_and_data"].next()
+ except StopIteration:
+ # There are no more input smiles strings...
+ raise StopIteration()
+
+ smi = smile_and_datum["smiles"]
+ data = smile_and_datum["data"] # Everything on SMILES line but the
+ # SMILES string itself (e.g., the
+ # molecule name).
+
+ # Collect the data associated with this smiles (e.g., the molecule
+ # name).
+ tag = " ".join(data)
+
+ # sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH").
+ # Note that the second entry indicates what state the site SHOULD be
+ # in (not the one it IS in per the SMILES string). It's calculated
+ # based on the probablistic distributions obtained during training.
+ sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs)
+
+ new_smis = [smi]
+ for site in sites:
+ # Make a new smiles with the correct protonation state. Note that
+ # new_smis is a growing list. This is how multiple protonation
+ # sites are handled.
+
+ # new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site)
+ new_smis = ProtSubstructFuncs.protonate_site(new_smis, site)
+ # print(site, new_smis) # Good for debugging.
+
+ # Only add new smiles if not already in the list.
+ # for s in new_smis_to_perhaps_add:
+ # if not s in new_smis:
+ # new_smis.append(s)
+
+ # In some cases, the script might generate redundant molecules.
+ # Phosphonates, when the pH is between the two pKa values and the
+ # stdev value is big enough, for example, will generate two identical
+ # BOTH states. Let's remove this redundancy.
+ new_smis = list(set(new_smis))
+
+ # Deprotonating protonated aromatic nitrogen gives [nH-]. Change this
+ # to [n-]. This is a hack.
+ new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis]
+
+ # Sometimes Dimorphite-DL generates molecules that aren't actually
+ # possible. Simply convert these to mol objects to eliminate the bad
+ # ones (that are None).
+ new_smis = [
+ s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None
+ ]
+
+ # If there are no smi left, return the input one at the very least.
+ # All generated forms have apparently been judged
+ # inappropriate/mal-formed.
+ if len(new_smis) == 0:
+ new_smis = [smi]
+
+ # If the user wants to see the target states, add those
+ # to the ends of each line.
+ if self.args["label_states"]:
+ states = "\t".join([x[1] for x in sites])
+ new_lines = [x + "\t" + tag + "\t" + states for x in new_smis]
+ else:
+ new_lines = [x + "\t" + tag for x in new_smis]
+
+ self.cur_prot_SMI = new_lines
+
+ return self.next()
+
+
+class ProtSubstructFuncs:
+ """A namespace to store functions for loading the substructures that can
+ be protonated. To keep things organized."""
+
+ @staticmethod
+ def load_protonation_substructs_calc_state_for_ph(
+ min_ph=6.4, max_ph=8.4, pka_std_range=1
+ ):
+ """A pre-calculated list of R-groups with protonation sites, with their
+ likely pKa bins.
+
+ :param float min_ph: The lower bound on the pH range, defaults to 6.4.
+ :param float max_ph: The upper bound on the pH range, defaults to 8.4.
+ :param pka_std_range: Basically the precision (stdev from predicted pKa to
+ consider), defaults to 1.
+ :return: A dict of the protonation substructions for the specified pH
+ range.
+ """
+
+ subs = []
+ pwd = os.path.dirname(os.path.realpath(__file__))
+
+ site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts")
+ with open(site_structures_file, "r") as substruct:
+ for line in substruct:
+ line = line.strip()
+ sub = {}
+ if line is not "":
+ splits = line.split()
+ sub["name"] = splits[0]
+ sub["smart"] = splits[1]
+ sub["mol"] = Chem.MolFromSmarts(sub["smart"])
+
+ # NEED TO DIVIDE THIS BY 3s
+ pka_ranges = [
+ splits[i : i + 3] for i in range(2, len(splits) - 1, 3)
+ ]
+
+ prot = []
+ for pka_range in pka_ranges:
+ site = pka_range[0]
+ std = float(pka_range[2]) * pka_std_range
+ mean = float(pka_range[1])
+ protonation_state = ProtSubstructFuncs.define_protonation_state(
+ mean, std, min_ph, max_ph
+ )
+
+ prot.append([site, protonation_state])
+
+ sub["prot_states_for_pH"] = prot
+ subs.append(sub)
+ return subs
+
+ @staticmethod
+ def define_protonation_state(mean, std, min_ph, max_ph):
+ """Updates the substructure definitions to include the protonation state
+ based on the user-given pH range. The size of the pKa range is also based
+ on the number of standard deviations to be considered by the user param.
+
+ :param float mean: The mean pKa.
+ :param float std: The precision (stdev).
+ :param float min_ph: The min pH of the range.
+ :param float max_ph: The max pH of the range.
+ :return: A string describing the protonation state.
+ """
+
+ min_pka = mean - std
+ max_pka = mean + std
+
+ # This needs to be reassigned, and 'ERROR' should never make it past the
+ # next set of checks.
+ if min_pka <= max_ph and min_ph <= max_pka:
+ protonation_state = "BOTH"
+ elif mean > max_ph:
+ protonation_state = "PROTONATED"
+ else:
+ protonation_state = "DEPROTONATED"
+
+ return protonation_state
+
+ @staticmethod
+ def get_prot_sites_and_target_states(smi, subs):
+ """For a single molecule, find all possible matches in the protonation
+ R-group list, subs. Items that are higher on the list will be matched
+ first, to the exclusion of later items.
+
+ :param string smi: A SMILES string.
+ :param list subs: Substructure information.
+ :return: A list of protonation sites and their pKa bin. ('PROTONATED',
+ 'BOTH', or 'DEPROTONATED')
+ """
+
+ # Convert the Smiles string (smi) to an RDKit Mol Obj
+ mol = UtilFuncs.convert_smiles_str_to_mol(smi)
+
+ # Check Conversion worked
+ if mol is None:
+ UtilFuncs.eprint("ERROR: ", smi)
+ return []
+
+ # Try to Add hydrogens. if failed return []
+ try:
+ mol = Chem.AddHs(mol)
+ except Exception:
+ UtilFuncs.eprint("ERROR: ", smi)
+ return []
+
+ # Check adding Hs worked
+ if mol is None:
+ UtilFuncs.eprint("ERROR: ", smi)
+ return []
+
+ ProtectUnprotectFuncs.unprotect_molecule(mol)
+ protonation_sites = []
+
+ for item in subs:
+ smart = item["mol"]
+ if mol.HasSubstructMatch(smart):
+ matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart)
+ prot = item["prot_states_for_pH"]
+ for match in matches:
+ # We want to move the site from being relative to the
+ # substructure, to the index on the main molecule.
+ for site in prot:
+ proton = int(site[0])
+ category = site[1]
+ new_site = (match[proton], category, item["name"])
+
+ if not new_site in protonation_sites:
+ # Because sites must be unique.
+ protonation_sites.append(new_site)
+
+ ProtectUnprotectFuncs.protect_molecule(mol, match)
+
+ return protonation_sites
+
+ @staticmethod
+ def protonate_site(smis, site):
+ """Given a list of SMILES strings, we protonate the site.
+
+ :param list smis: The list of SMILES strings.
+ :param tuple site: Information about the protonation site.
+ (idx, target_prot_state, prot_site_name)
+ :return: A list of the appropriately protonated SMILES.
+ """
+
+ # Decouple the atom index and its target protonation state from the site
+ # tuple
+ idx, target_prot_state, prot_site_name = site
+
+ # Initialize the output list
+ output_smis = []
+
+ state_to_charge = {"DEPROTONATED": [-1], "PROTONATED": [0], "BOTH": [-1, 0]}
+
+ charges = state_to_charge[target_prot_state]
+
+ # Now make the actual smiles match the target protonation state.
+ output_smis = ProtSubstructFuncs.set_protonation_charge(
+ smis, idx, charges, prot_site_name
+ )
+
+ return output_smis
+
+ @staticmethod
+ def set_protonation_charge(smis, idx, charges, prot_site_name):
+ """Sets the atomic charge on a particular site for a set of SMILES.
+
+ :param list smis: A list of the SMILES strings.
+ :param int idx: The index of the atom to consider.
+ :param list charges: A list of the charges (ints) to assign at
+ this site.
+ :param string prot_site_name: The name of the protonation site.
+ :return: A list of the processed SMILES strings.
+ """
+
+ # Sets up the output list and the Nitrogen charge
+ output = []
+
+ for charge in charges:
+ # The charge for Nitrogens is 1 higher than others (i.e., protonated
+ # state is positively charged).
+ nitro_charge = charge + 1
+
+ # But there are a few nitrogen moieties where the acidic group is the
+ # neutral one. Amides are a good example. I gave some thought re. how
+ # to best flag these. I decided that those nitrogen-containing
+ # moieties where the acidic group is neutral (rather than positively
+ # charged) will have "*" in the name.
+ if "*" in prot_site_name:
+ nitro_charge = nitro_charge - 1 # Undo what was done previously.
+
+ for smi in smis:
+
+ # Convert smilesstring (smi) into a RDKit Mol
+ mol = UtilFuncs.convert_smiles_str_to_mol(smi)
+
+ # Check that the conversion worked, skip if it fails
+ if mol is None:
+ continue
+
+ atom = mol.GetAtomWithIdx(idx)
+
+ # Assign the protonation charge, with special care for Nitrogens
+ element = atom.GetAtomicNum()
+ if element == 7:
+ atom.SetFormalCharge(nitro_charge)
+ else:
+ atom.SetFormalCharge(charge)
+
+ # Convert back to SMILE and add to output
+ out_smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
+ output.append(out_smile)
+
+ return output
+
+
+class ProtectUnprotectFuncs:
+ """A namespace for storing functions that are useful for protecting and
+ unprotecting molecules. To keep things organized. We need to identify and
+ mark groups that have been matched with a substructure."""
+
+ @staticmethod
+ def unprotect_molecule(mol):
+ """Sets the protected property on all atoms to 0. This also creates the
+ property for new molecules.
+
+ :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object.
+ :type mol: The rdkit Mol object with atoms unprotected.
+ """
+
+ for atom in mol.GetAtoms():
+ atom.SetProp("_protected", "0")
+
+ @staticmethod
+ def protect_molecule(mol, match):
+ """Given a 'match', a list of molecules idx's, we set the protected status
+ of each atom to 1. This will prevent any matches using that atom in the
+ future.
+
+ :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect.
+ :param list match: A list of molecule idx's.
+ """
+
+ for idx in match:
+ atom = mol.GetAtomWithIdx(idx)
+ atom.SetProp("_protected", "1")
+
+ @staticmethod
+ def get_unprotected_matches(mol, substruct):
+ """Finds substructure matches with atoms that have not been protected.
+ Returns list of matches, each match a list of atom idxs.
+
+ :param rdkit.Chem.rdchem.Mol mol: The Mol object to consider.
+ :param string substruct: The SMARTS string of the substructure ot match.
+ :return: A list of the matches. Each match is itself a list of atom idxs.
+ """
+
+ matches = mol.GetSubstructMatches(substruct)
+ unprotected_matches = []
+ for match in matches:
+ if ProtectUnprotectFuncs.is_match_unprotected(mol, match):
+ unprotected_matches.append(match)
+ return unprotected_matches
+
+ @staticmethod
+ def is_match_unprotected(mol, match):
+ """Checks a molecule to see if the substructure match contains any
+ protected atoms.
+
+ :param rdkit.Chem.rdchem.Mol mol: The Mol object to check.
+ :param list match: The match to check.
+ :return: A boolean, whether the match is present or not.
+ """
+
+ for idx in match:
+ atom = mol.GetAtomWithIdx(idx)
+ protected = atom.GetProp("_protected")
+ if protected == "1":
+ return False
+ return True
+
+
+class TestFuncs:
+ """A namespace for storing functions that perform tests on the code. To
+ keep things organized."""
+
+ @staticmethod
+ def test():
+ """Tests all the 38 groups."""
+
+ smis = [
+ # [input smiles, pka, protonated, deprotonated, category]
+ ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"],
+ ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"],
+ [
+ "CC(=O)NOC(C)=O",
+ "CC(=O)NOC(C)=O",
+ "CC(=O)[N-]OC(C)=O",
+ "Amide_electronegative",
+ ],
+ ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"],
+ [
+ "Brc1ccc(C2NCCS2)cc1",
+ "Brc1ccc(C2[NH2+]CCS2)cc1",
+ "Brc1ccc(C2NCCS2)cc1",
+ "Amines_primary_secondary_tertiary",
+ ],
+ [
+ "CC(=O)[n+]1ccc(N)cc1",
+ "CC(=O)[n+]1ccc([NH3+])cc1",
+ "CC(=O)[n+]1ccc(N)cc1",
+ "Anilines_primary",
+ ],
+ ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"],
+ [
+ "Cc1ccccc1N(C)C",
+ "Cc1ccccc1[NH+](C)C",
+ "Cc1ccccc1N(C)C",
+ "Anilines_tertiary",
+ ],
+ [
+ "BrC1=CC2=C(C=C1)NC=C2",
+ "Brc1ccc2[nH]ccc2c1",
+ "Brc1ccc2[n-]ccc2c1",
+ "Indole_pyrrole",
+ ],
+ [
+ "O=c1cc[nH]cc1",
+ "O=c1cc[nH]cc1",
+ "O=c1cc[n-]cc1",
+ "Aromatic_nitrogen_protonated",
+ ],
+ ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"],
+ ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"],
+ ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"],
+ [
+ "C(F)(F)(F)C(=O)NC(=O)C",
+ "CC(=O)NC(=O)C(F)(F)F",
+ "CC(=O)[N-]C(=O)C(F)(F)F",
+ "Imide",
+ ],
+ ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"],
+ [
+ "CC(C)(C)C(N(C)O)=O",
+ "CN(O)C(=O)C(C)(C)C",
+ "CN([O-])C(=O)C(C)(C)C",
+ "N-hydroxyamide",
+ ],
+ ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"],
+ ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"],
+ ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"],
+ ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"],
+ [
+ "Brc1cc(O)cc(Br)c1",
+ "Oc1cc(Br)cc(Br)c1",
+ "[O-]c1cc(Br)cc(Br)c1",
+ "Phenol",
+ ],
+ [
+ "CC(=O)c1ccc(S)cc1",
+ "CC(=O)c1ccc(S)cc1",
+ "CC(=O)c1ccc([S-])cc1",
+ "Phenyl_Thiol",
+ ],
+ [
+ "C=CCOc1ccc(C(=O)O)cc1",
+ "C=CCOc1ccc(C(=O)O)cc1",
+ "C=CCOc1ccc(C(=O)[O-])cc1",
+ "Phenyl_carboxyl",
+ ],
+ ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"],
+ ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"],
+ [
+ "CC(C)OP(C)(=O)O",
+ "CC(C)OP(C)(=O)O",
+ "CC(C)OP(C)(=O)[O-]",
+ "Phosphonate_ester",
+ ],
+ [
+ "CC1(C)OC(=O)NC1=O",
+ "CC1(C)OC(=O)NC1=O",
+ "CC1(C)OC(=O)[N-]C1=O",
+ "Ringed_imide1",
+ ],
+ ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"],
+ ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"],
+ [
+ "COc1ccc(S(=O)O)cc1",
+ "COc1ccc(S(=O)O)cc1",
+ "COc1ccc(S(=O)[O-])cc1",
+ "Sulfinic_acid",
+ ],
+ ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"],
+ [
+ "CC(=O)CSCCS(O)(=O)=O",
+ "CC(=O)CSCCS(=O)(=O)O",
+ "CC(=O)CSCCS(=O)(=O)[O-]",
+ "Sulfonate",
+ ],
+ ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"],
+ ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"],
+ [
+ "Brc1cc[nH+]cc1",
+ "Brc1cc[nH+]cc1",
+ "Brc1ccncc1",
+ "Aromatic_nitrogen_unprotonated",
+ ],
+ [
+ "C=C(O)c1c(C)cc(C)cc1C",
+ "C=C(O)c1c(C)cc(C)cc1C",
+ "C=C([O-])c1c(C)cc(C)cc1C",
+ "Vinyl_alcohol",
+ ],
+ ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"],
+ ]
+
+ smis_phos = [
+ [
+ "O=P(O)(O)OCCCC",
+ "CCCCOP(=O)(O)O",
+ "CCCCOP(=O)([O-])O",
+ "CCCCOP(=O)([O-])[O-]",
+ "Phosphate",
+ ],
+ [
+ "CC(P(O)(O)=O)C",
+ "CC(C)P(=O)(O)O",
+ "CC(C)P(=O)([O-])O",
+ "CC(C)P(=O)([O-])[O-]",
+ "Phosphonate",
+ ],
+ ]
+
+ # Load the average pKa values.
+ average_pkas = {
+ l.split()[0].replace("*", ""): float(l.split()[3])
+ for l in open("site_substructures.smarts")
+ if l.split()[0] not in ["Phosphate", "Phosphonate"]
+ }
+ average_pkas_phos = {
+ l.split()[0].replace("*", ""): [float(l.split()[3]), float(l.split()[6])]
+ for l in open("site_substructures.smarts")
+ if l.split()[0] in ["Phosphate", "Phosphonate"]
+ }
+
+ print("Running Tests")
+ print("=============")
+ print("")
+
+ print("Very Acidic (pH -10000000)")
+ print("--------------------------")
+ print("")
+
+ args = {
+ "min_ph": -10000000,
+ "max_ph": -10000000,
+ "pka_precision": 0.5,
+ "smiles": "",
+ "label_states": True,
+ }
+
+ for smi, protonated, deprotonated, category in smis:
+ args["smiles"] = smi
+ TestFuncs.test_check(args, [protonated], ["PROTONATED"])
+
+ for smi, protonated, mix, deprotonated, category in smis_phos:
+ args["smiles"] = smi
+ TestFuncs.test_check(args, [protonated], ["PROTONATED"])
+
+ args["min_ph"] = 10000000
+ args["max_ph"] = 10000000
+
+ print("")
+ print("Very Basic (pH 10000000)")
+ print("------------------------")
+ print("")
+
+ for smi, protonated, deprotonated, category in smis:
+ args["smiles"] = smi
+ TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
+
+ for smi, protonated, mix, deprotonated, category in smis_phos:
+ args["smiles"] = smi
+ TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
+
+ print("")
+ print("pH is Category pKa")
+ print("------------------")
+ print("")
+
+ for smi, protonated, deprotonated, category in smis:
+ avg_pka = average_pkas[category]
+
+ args["smiles"] = smi
+ args["min_ph"] = avg_pka
+ args["max_ph"] = avg_pka
+
+ TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"])
+
+ for smi, protonated, mix, deprotonated, category in smis_phos:
+ args["smiles"] = smi
+
+ avg_pka = average_pkas_phos[category][0]
+ args["min_ph"] = avg_pka
+ args["max_ph"] = avg_pka
+
+ TestFuncs.test_check(args, [mix, protonated], ["BOTH"])
+
+ avg_pka = average_pkas_phos[category][1]
+ args["min_ph"] = avg_pka
+ args["max_ph"] = avg_pka
+
+ TestFuncs.test_check(
+ args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"]
+ )
+
+ avg_pka = 0.5 * (
+ average_pkas_phos[category][0] + average_pkas_phos[category][1]
+ )
+ args["min_ph"] = avg_pka
+ args["max_ph"] = avg_pka
+ args["pka_precision"] = 5 # Should give all three
+
+ TestFuncs.test_check(
+ args, [mix, deprotonated, protonated], ["BOTH", "BOTH"]
+ )
+
+ @staticmethod
+ def test_check(args, expected_output, labels):
+ """Tests most ionizable groups. The ones that can only loose or gain a single proton.
+
+ :param args: The arguments to pass to protonate()
+ :param expected_output: A list of the expected SMILES-strings output.
+ :param labels: The labels. A list containing combo of BOTH, PROTONATED,
+ DEPROTONATED.
+ :raises Exception: Wrong number of states produced.
+ :raises Exception: Unexpected output SMILES.
+ :raises Exception: Wrong labels.
+ """
+
+ output = list(Protonate(args))
+ output = [o.split() for o in output]
+
+ num_states = len(expected_output)
+
+ if len(output) != num_states:
+ msg = (
+ args["smiles"]
+ + " should have "
+ + str(num_states)
+ + " states at at pH "
+ + str(args["min_ph"])
+ + ": "
+ + str(output)
+ )
+ print(msg)
+ raise Exception(msg)
+
+ if len(set([l[0] for l in output]) - set(expected_output)) != 0:
+ msg = (
+ args["smiles"]
+ + " is not "
+ + " AND ".join(expected_output)
+ + " at pH "
+ + str(args["min_ph"])
+ + " - "
+ + str(args["max_ph"])
+ + "; it is "
+ + " AND ".join([l[0] for l in output])
+ )
+ print(msg)
+ raise Exception(msg)
+
+ if len(set([l[1] for l in output]) - set(labels)) != 0:
+ msg = (
+ args["smiles"]
+ + " not labeled as "
+ + " AND ".join(labels)
+ + "; it is "
+ + " AND ".join([l[1] for l in output])
+ )
+ print(msg)
+ raise Exception(msg)
+
+ ph_range = sorted(list(set([args["min_ph"], args["max_ph"]])))
+ ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")"
+ print(
+ "(CORRECT) "
+ + ph_range_str.ljust(10)
+ + " "
+ + args["smiles"]
+ + " => "
+ + " AND ".join([l[0] for l in output])
+ )
+
+
+def run(**kwargs):
+ """A helpful, importable function for those who want to call Dimorphite-DL
+ from another Python script rather than the command line. Note that this
+ function accepts keyword arguments that match the command-line parameters
+ exactly. If you want to pass and return a list of RDKit Mol objects, import
+ run_with_mol_list() instead.
+
+ :param **kwargs: For a complete description, run dimorphite_dl.py from the
+ command line with the -h option.
+ :type kwargs: dict
+ """
+
+ # Run the main function with the specified arguments.
+ main(kwargs)
+
+
+def run_with_mol_list(mol_lst, **kwargs):
+ """A helpful, importable function for those who want to call Dimorphite-DL
+ from another Python script rather than the command line. Note that this
+ function is for passing Dimorphite-DL a list of RDKit Mol objects, together
+ with command-line parameters. If you want to use only the same parameters
+ that you would use from the command line, import run() instead.
+
+ :param mol_lst: A list of rdkit.Chem.rdchem.Mol objects.
+ :type mol_lst: list
+ :raises Exception: If the **kwargs includes "smiles", "smiles_file",
+ "output_file", or "test" parameters.
+ :return: A list of properly protonated rdkit.Chem.rdchem.Mol objects.
+ :rtype: list
+ """
+
+ # Do a quick check to make sure the user input makes sense.
+ for bad_arg in ["smiles", "smiles_file", "output_file", "test"]:
+ if bad_arg in kwargs:
+ msg = (
+ "You're using Dimorphite-DL's run_with_mol_list(mol_lst, "
+ + '**kwargs) function, but you also passed the "'
+ + bad_arg
+ + '" argument. Did you mean to use the '
+ + "run(**kwargs) function instead?"
+ )
+ print(msg)
+ raise Exception(msg)
+
+ # Set the return_as_list flag so main() will return the protonated smiles
+ # as a list.
+ kwargs["return_as_list"] = True
+
+ # Having reviewed the code, it will be very difficult to rewrite it so
+ # that a list of Mol objects can be used directly. Intead, convert this
+ # list of mols to smiles and pass that. Not efficient, but it will work.
+ protonated_smiles_and_props = []
+ for m in mol_lst:
+ props = m.GetPropsAsDict()
+ kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True)
+ protonated_smiles_and_props.extend(
+ [(s.split("\t")[0], props) for s in main(kwargs)]
+ )
+
+ # Now convert the list of protonated smiles strings back to RDKit Mol
+ # objects. Also, add back in the properties from the original mol objects.
+ mols = []
+ for s, props in protonated_smiles_and_props:
+ m = Chem.MolFromSmiles(s)
+ if m:
+ for prop, val in props.items():
+ if type(val) is int:
+ m.SetIntProp(prop, val)
+ elif type(val) is float:
+ m.SetDoubleProp(prop, val)
+ elif type(val) is bool:
+ m.SetBoolProp(prop, val)
+ else:
+ m.SetProp(prop, str(val))
+ mols.append(m)
+ else:
+ UtilFuncs.eprint(
+ "WARNING: Could not process molecule with SMILES string "
+ + s
+ + " and properties "
+ + str(props)
+ )
+
+ return mols
+
+
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r 5c501eb8d56c rdconf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rdconf.py Sat Dec 04 16:39:31 2021 +0000
@@ -0,0 +1,229 @@
+#!/usr/bin/python3
+
+import gzip
+import os
+import sys
+from optparse import OptionParser
+
+from rdkit.Chem import AllChem as Chem
+
+"""
+This script was originally written by David Koes, University of Pittsburgh:
+https://github.com/dkoes/rdkit-scripts/blob/master/rdconf.py
+It is licensed under the MIT licence.
+
+Given a smiles file, generate 3D conformers in output sdf.
+Energy minimizes and filters conformers to meet energy window and rms constraints.
+
+Some time ago I compared this to alternative conformer generators and
+it was quite competitive (especially after RDKit's UFF implementation
+added OOP terms).
+"""
+
+
+# convert smiles to sdf
+def getRMS(mol, c1, c2):
+ rms = Chem.GetBestRMS(mol, mol, c1, c2)
+ return rms
+
+
+parser = OptionParser(usage="Usage: %prog [options] .smi