Mercurial > repos > bgruening > openbabel_remduplicates

diff cheminfolib.py @ 15:c5de6c19eb06 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit d9c51279c061a1da948a2582d5b502ca7573adbf
author: bgruening
date: Thu, 15 Aug 2024 11:00:46 +0000
parents: 12aca74f07d7
--- a/cheminfolib.py	Tue Nov 10 20:30:47 2020 +0000
+++ b/cheminfolib.py	Thu Aug 15 11:00:46 2024 +0000
@@ -11,28 +11,32 @@
 import tempfile
 from multiprocessing import Pool
 
-
 try:
     from galaxy import eggs
-    eggs.require('psycopg2')
+
+    eggs.require("psycopg2")
 except ImportError:
     psycopg2 = None
-    print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')
+    print(
+        "psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB"
+    )
 
 try:
     from openbabel import openbabel, pybel
+
     openbabel.obErrorLog.StopLogging()
 except ImportError:
     openbabel, pybel = None, None
-    print('OpenBabel could not be found. A few functions are not available without OpenBabel.')
+    print(
+        "OpenBabel could not be found. A few functions are not available without OpenBabel."
+    )
 
 
 def CountLines(path):
-    out = subprocess.Popen(['wc', '-l', path],
-                           stdout=subprocess.PIPE,
-                           stderr=subprocess.STDOUT
-                           ).communicate()[0]
-    return int(out.partition(b' ')[0])
+    out = subprocess.Popen(
+        ["wc", "-l", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT
+    ).communicate()[0]
+    return int(out.partition(b" ")[0])
 
 
 def grep(pattern, file_obj):
@@ -49,15 +53,15 @@
     for line_counter, line in enumerate(open(filepath)):
         if line_counter > 10000:
             break
-        if line.find('$$$$') != -1:
-            return 'sdf'
-        elif line.find('@<TRIPOS>MOLECULE') != -1:
-            return 'mol2'
-        elif line.find('ligand id') != -1:
-            return 'drf'
-        elif possible_inchi and re.findall('^InChI=', line):
-            return 'inchi'
-        elif re.findall(r'^M\s+END', line):
+        if line.find("$$$$") != -1:
+            return "sdf"
+        elif line.find("@<TRIPOS>MOLECULE") != -1:
+            return "mol2"
+        elif line.find("ligand id") != -1:
+            return "drf"
+        elif possible_inchi and re.findall("^InChI=", line):
+            return "inchi"
+        elif re.findall(r"^M\s+END", line):
             mol = True
         # first line is not an InChI, so it can't be an InChI file
         possible_inchi = False
@@ -65,99 +69,128 @@
     if mol:
         # END can occures before $$$$, so and SDF file will
         # be recognised as mol, if you not using this hack'
-        return 'mol'
-    return 'smi'
+        return "mol"
+    return "smi"
 
 
 def db_connect(args):
     try:
-        db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd))
+        db_conn = psycopg2.connect(
+            "dbname=%s user=%s host=%s password=%s"
+            % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)
+        )
         return db_conn
     except psycopg2.Error:
-        sys.exit('Unable to connect to the db')
+        sys.exit("Unable to connect to the db")
 
 
 ColumnNames = {
-    'can_smiles': 'Canonical SMILES',
-    'can': 'Canonical SMILES',
-    'inchi': 'InChI',
-    'inchi_key': 'InChI key',
-    'inchi_key_first': 'InChI key first',
-    'inchi_key_last': 'InChI key last',
-    'molwt': 'Molecular weight',
-    'hbd': 'Hydrogen-bond donors',
-    'donors': 'Hydrogen-bond donors',
-    'hba': 'Hydrogen-bond acceptors',
-    'acceptors': 'Hydrogen-bond acceptors',
-    'rotbonds': 'Rotatable bonds',
-    'logp': 'logP',
-    'psa': 'Polar surface area',
-    'mr': 'Molecular refractivity',
-    'atoms': 'Number of heavy atoms',
-    'rings': 'Number of rings',
-    'set_bits': 'FP2 bits',
-    'id': 'Internal identifier',
-    'tani': 'Tanimoto coefficient',
-    'spectrophore': 'Spectrophores(TM)',
-    'dist_spectrophore': 'Spectrophores(TM) distance to target',
-    'synonym': 'Entry id',
+    "can_smiles": "Canonical SMILES",
+    "can": "Canonical SMILES",
+    "inchi": "InChI",
+    "inchi_key": "InChI key",
+    "inchi_key_first": "InChI key first",
+    "inchi_key_last": "InChI key last",
+    "molwt": "Molecular weight",
+    "hbd": "Hydrogen-bond donors",
+    "donors": "Hydrogen-bond donors",
+    "hba": "Hydrogen-bond acceptors",
+    "acceptors": "Hydrogen-bond acceptors",
+    "rotbonds": "Rotatable bonds",
+    "logp": "logP",
+    "psa": "Polar surface area",
+    "mr": "Molecular refractivity",
+    "atoms": "Number of heavy atoms",
+    "rings": "Number of rings",
+    "set_bits": "FP2 bits",
+    "id": "Internal identifier",
+    "tani": "Tanimoto coefficient",
+    "spectrophore": "Spectrophores(TM)",
+    "dist_spectrophore": "Spectrophores(TM) distance to target",
+    "synonym": "Entry id",
 }
 
 OBDescriptor = {
-    'atoms': ["atoms", "Number of atoms"],
-    'hatoms': ["hatoms", "Number of heavy atoms"],  # self defined tag hatoms in plugindefines.txt
-    'can_smiles': ["cansmi", "Canonical SMILES"],
-    'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
+    "atoms": ["atoms", "Number of atoms"],
+    "hatoms": [
+        "hatoms",
+        "Number of heavy atoms",
+    ],  # self defined tag hatoms in plugindefines.txt
+    "can_smiles": ["cansmi", "Canonical SMILES"],
+    "can_smilesNS": ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
     # ["abonds", "Number of aromatic bonds"],
     # ["bonds", "Number of bonds"],
     # ["dbonds", "Number of double bonds"],
     # ["formula", "Chemical formula"],
-    'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
-    'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
-    'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
-    'inchi': ["InChI", "IUPAC InChI identifier"],
-    'inchi_key': ["InChIKey", "InChIKey"],
+    "hba": ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
+    "hba2": ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
+    "hbd": ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
+    "inchi": ["InChI", "IUPAC InChI identifier"],
+    "inchi_key": ["InChIKey", "InChIKey"],
     # ["L5", "Lipinski Rule of Five"],
-    'logp': ["logP", "octanol/water partition coefficient"],
-    'mr': ["MR", "molar refractivity"],
-    'molwt': ["MW", "Molecular Weight filter"],
+    "logp": ["logP", "octanol/water partition coefficient"],
+    "mr": ["MR", "molar refractivity"],
+    "molwt": ["MW", "Molecular Weight filter"],
     # ["nF", "Number of Fluorine Atoms"],
     # ["s", "SMARTS filter"],
     # ["sbonds", "Number of single bonds"],
     # ["smarts", "SMARTS filter"],
     # ["tbonds", "Number of triple bonds"],
     # ["title", "For comparing a molecule's title"],
-    'psa': ["TPSA", "topological polar surface area"],
-    'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
+    "psa": ["TPSA", "topological polar surface area"],
+    "rotbonds": ["ROTATABLE_BOND", "rotatable bonds"],
 }
 
 
 def print_output(args, rows):
-    if args.oformat == 'table':
-        outfile = open(args.output, 'w')
-        requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ')
+    if args.oformat == "table":
+        outfile = open(args.output, "w")
+        requested_fields = (
+            filter(lambda x: x not in ["[", "]", "'"], args.fetch)
+        ).split(", ")
         if args.header:
-            outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n')
+            outfile.write(
+                "Identifier\t"
+                + "\t".join([ColumnNames[key] for key in requested_fields])
+                + "\n"
+            )
         for row in rows:
-            outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n')
+            outfile.write(
+                row["synonym"]
+                + "\t"
+                + "\t".join([str(row[key]) for key in requested_fields])
+                + "\n"
+            )
 
-    elif args.oformat in ['sdf', 'mol2']:
+    elif args.oformat in ["sdf", "mol2"]:
         outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
         for row in rows:
             try:
-                mol = pybel.readstring('sdf', row['mol'])
-                if args.oformat == 'sdf':
-                    keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ')
-                    mol.data.update({ColumnNames['synonym']: row['synonym']})
-                    if 'inchi_key' in keys:
-                        keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ')
-                    [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key]
+                mol = pybel.readstring("sdf", row["mol"])
+                if args.oformat == "sdf":
+                    keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(
+                        ", "
+                    )
+                    mol.data.update({ColumnNames["synonym"]: row["synonym"]})
+                    if "inchi_key" in keys:
+                        keys = (
+                            ", ".join(keys).replace(
+                                "inchi_key", "inchi_key_first, inchi_key_last"
+                            )
+                        ).split(", ")
+                    [
+                        mol.data.update({ColumnNames[key]: row[key]})
+                        for key in keys
+                        if key
+                    ]
                 outfile.write(mol)
             except OSError:
                 pass
     else:
-        outfile = open(args.output, 'w')
-        outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows]))
+        outfile = open(args.output, "w")
+        outfile.write(
+            "\n".join(["%s\t%s" % (row[args.oformat], row["synonym"]) for row in rows])
+        )
     outfile.close()
 
 
@@ -167,31 +200,37 @@
 
 def get_properties_ext(mol):
     HBD = pybel.Smarts("[!#6;!H0]")
-    HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);"
-                        "!$(*~N=O);X1,X2]),$([#7;v3;"
-                        "!$([nH]);!$(*(-a)-a)])]"
-                        ))
+    HBA = pybel.Smarts(
+        (
+            "[$([$([#8,#16]);!$(*=N~O);"
+            "!$(*~N=O);X1,X2]),$([#7;v3;"
+            "!$([nH]);!$(*(-a)-a)])]"
+        )
+    )
     calc_desc_dict = mol.calcdesc()
 
     try:
-        logp = calc_desc_dict['logP']
+        logp = calc_desc_dict["logP"]
     except KeyError:
-        logp = calc_desc_dict['LogP']
+        logp = calc_desc_dict["LogP"]
 
-    return {"molwt": mol.molwt,
-            "logp": logp,
-            "donors": len(HBD.findall(mol)),
-            "acceptors": len(HBA.findall(mol)),
-            "psa": calc_desc_dict['TPSA'],
-            "mr": calc_desc_dict['MR'],
-            "rotbonds": mol.OBMol.NumRotors(),
-            "can": mol.write("can").split()[0].strip(),  # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
-            "inchi": mol.write("inchi").strip(),
-            "inchi_key": get_inchikey(mol).strip(),
-            "rings": len(mol.sssr),
-            "atoms": mol.OBMol.NumHvyAtoms(),
-            "spectrophore": OBspectrophore(mol),
-            }
+    return {
+        "molwt": mol.molwt,
+        "logp": logp,
+        "donors": len(HBD.findall(mol)),
+        "acceptors": len(HBA.findall(mol)),
+        "psa": calc_desc_dict["TPSA"],
+        "mr": calc_desc_dict["MR"],
+        "rotbonds": mol.OBMol.NumRotors(),
+        "can": mol.write("can")
+        .split()[0]
+        .strip(),  # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
+        "inchi": mol.write("inchi").strip(),
+        "inchi_key": get_inchikey(mol).strip(),
+        "rings": len(mol.sssr),
+        "atoms": mol.OBMol.NumHvyAtoms(),
+        "spectrophore": OBspectrophore(mol),
+    }
 
 
 def get_inchikey(mol):
@@ -206,10 +245,12 @@
     spectrophore = pybel.ob.OBSpectrophore()
     # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
     spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd)
-    return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)])
+    return ", ".join(
+        ["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]
+    )
 
 
-def split_library(lib_path, lib_format='sdf', package_size=None):
+def split_library(lib_path, lib_format="sdf", package_size=None):
     """
     Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size)
     IT currently ONLY WORKS FOR SD-Files
@@ -217,18 +258,39 @@
     pack = 1
     mol_counter = 0
 
-    outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
+    outfile = open(
+        "/%s/%s_pack_%i.%s"
+        % (
+            "/".join(lib_path.split("/")[:-1]),
+            lib_path.split("/")[-1].split(".")[0],
+            pack,
+            "sdf",
+        ),
+        "w",
+    )
 
-    for line in open(lib_path, 'r'):
+    for line in open(lib_path, "r"):
         outfile.write(line)
-        if line.strip() == '$$$$':
+        if line.strip() == "$$$$":
             mol_counter += 1
             if mol_counter % package_size == 0:
                 outfile.close()
                 pack += 1
-                outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
+                outfile = open(
+                    "/%s/%s_pack_%i.%s"
+                    % (
+                        "/".join(lib_path.split("/")[:-1]),
+                        lib_path.split("/")[-1].split(".")[0],
+                        pack,
+                        "sdf",
+                    ),
+                    "w",
+                )
                 if mol_counter * 10 % package_size == 0:
-                    print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1))
+                    print(
+                        "%i molecules parsed, starting pack nr. %i"
+                        % (mol_counter, pack - 1)
+                    )
     outfile.close()
 
     return True
@@ -242,7 +304,7 @@
     output_files = []
     tfile = tempfile.NamedTemporaryFile(delete=False)
 
-    smiles_handle = open(smiles_file, 'r')
+    smiles_handle = open(smiles_file, "r")
     for count, line in enumerate(smiles_handle):
         if count % structures_in_one_file == 0 and count != 0:
             tfile.close()
@@ -257,16 +319,19 @@
 
 def mp_run(input_path, regex, PROCESSES, function_to_call):
     paths = []
-    [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))]
+    [
+        paths.append(compound_file)
+        for compound_file in glob.glob(str(input_path) + str(regex))
+    ]
     paths.sort()
 
     pool = Pool(processes=PROCESSES)
-    print('Process initialized with', PROCESSES, 'processors')
+    print("Process initialized with", PROCESSES, "processors")
     result = pool.map_async(function_to_call, paths)
     result.get()
 
     return paths
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     print(check_filetype(sys.argv[1]))
author	bgruening
date	Thu, 15 Aug 2024 11:00:46 +0000
parents	12aca74f07d7
children