Mercurial > repos > bgruening > openbabel_structure_distance_finder

--- a/change_title_to_metadata_value.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/change_title_to_metadata_value.py	Mon Oct 19 14:40:22 2020 +0000
@@ -6,29 +6,27 @@
     value of a given-id of the same molecule file.
 """

-import os
-import sys
 import argparse
 import random
 import string

-
 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()

+
 def main():
     parser = argparse.ArgumentParser(
         description="Change the title from a molecule file to metadata \
-value of a given-id of the same molecule file.",
+                     value of a given-id of the same molecule file.",
     )
-    parser.add_argument('--infile', '-i',
-        required=True, help="path to the input file")
-    parser.add_argument('--outfile', '-o',
-        required=True, help="path to the output file")
-    parser.add_argument('--key', '-k',
-        required=True, help="the metadata key from the sdf file which should inlcude the new title")
-    parser.add_argument('--random', '-r',
-        action="store_true", help="Add random suffix to the title.")
+    parser.add_argument('--infile', '-i', required=True,
+                        help="path to the input file")
+    parser.add_argument('--outfile', '-o', required=True,
+                        help="path to the output file")
+    parser.add_argument('--key', '-k', required=True,
+                        help="the metadata key from the sdf file which should inlcude the new title")
+    parser.add_argument('--random', '-r', action="store_true",
+                        help="Add random suffix to the title.")

     args = parser.parse_args()

@@ -39,11 +37,10 @@
             if args.random:
                 suffix = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(13))
                 mol.title += '__%s' % suffix
-        output.write( mol )
+        output.write(mol)

     output.close()


 if __name__ == "__main__":
     main()
-
--- a/cheminfolib.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/cheminfolib.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,31 +4,37 @@
     Copyright 2012, Bjoern Gruening and Xavier Lucas
 """

-import os, sys
+import glob
+import re
+import subprocess
+import sys
+import tempfile
+from multiprocessing import Pool
+

 try:
     from galaxy import eggs
     eggs.require('psycopg2')
-except:
+except ImportError:
+    psycopg2 = None
     print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')

 try:
     from openbabel import openbabel, pybel
     openbabel.obErrorLog.StopLogging()
-except:
+except ImportError:
+    openbabel, pybel = None, None
     print('OpenBabel could not be found. A few functions are not available without OpenBabel.')

-from multiprocessing import Pool
-import glob, tempfile, re
-import subprocess

-def CountLines( path ):
+def CountLines(path):
     out = subprocess.Popen(['wc', '-l', path],
-                         stdout=subprocess.PIPE,
-                         stderr=subprocess.STDOUT
-                         ).communicate()[0]
+                           stdout=subprocess.PIPE,
+                           stderr=subprocess.STDOUT
+                           ).communicate()[0]
     return int(out.partition(b' ')[0])

+
 def grep(pattern, file_obj):
     grepper = re.compile(pattern)
     for line in file_obj:
@@ -36,6 +42,7 @@
             return True
     return False

+
 def check_filetype(filepath):
     mol = False
     possible_inchi = True
@@ -50,76 +57,78 @@
             return 'drf'
         elif possible_inchi and re.findall('^InChI=', line):
             return 'inchi'
-        elif re.findall('^M\s+END', line):
+        elif re.findall(r'^M\s+END', line):
             mol = True
         # first line is not an InChI, so it can't be an InChI file
         possible_inchi = False

     if mol:
-        # END can occures before $$$$, so and SDF file will
+        # END can occur before $$$$, so an SDF file will
         # be recognised as mol, if you not using this hack'
         return 'mol'
     return 'smi'

+
 def db_connect(args):
     try:
-        db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd));
+        db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd))
         return db_conn
-    except:
+    except psycopg2.Error:
         sys.exit('Unable to connect to the db')

+
 ColumnNames = {
-    'can_smiles' : 'Canonical SMILES',
-    'can' : 'Canonical SMILES',
-    'inchi' : 'InChI',
-    'inchi_key' : 'InChI key',
-    'inchi_key_first' : 'InChI key first',
-    'inchi_key_last' : 'InChI key last',
-    'molwt' : 'Molecular weight',
-    'hbd' : 'Hydrogen-bond donors',
-    'donors' : 'Hydrogen-bond donors',
-    'hba' : 'Hydrogen-bond acceptors',
-    'acceptors' : 'Hydrogen-bond acceptors',
-    'rotbonds' : 'Rotatable bonds',
-    'logp' : 'logP',
-    'psa' : 'Polar surface area',
-    'mr' : 'Molecular refractivity',
-    'atoms' : 'Number of heavy atoms',
-    'rings' : 'Number of rings',
-    'set_bits' : 'FP2 bits',
-    'id' : 'Internal identifier',
-    'tani' : 'Tanimoto coefficient',
-    'spectrophore' : 'Spectrophores(TM)',
-    'dist_spectrophore' : 'Spectrophores(TM) distance to target',
-    'synonym' : 'Entry id',
+    'can_smiles': 'Canonical SMILES',
+    'can': 'Canonical SMILES',
+    'inchi': 'InChI',
+    'inchi_key': 'InChI key',
+    'inchi_key_first': 'InChI key first',
+    'inchi_key_last': 'InChI key last',
+    'molwt': 'Molecular weight',
+    'hbd': 'Hydrogen-bond donors',
+    'donors': 'Hydrogen-bond donors',
+    'hba': 'Hydrogen-bond acceptors',
+    'acceptors': 'Hydrogen-bond acceptors',
+    'rotbonds': 'Rotatable bonds',
+    'logp': 'logP',
+    'psa': 'Polar surface area',
+    'mr': 'Molecular refractivity',
+    'atoms': 'Number of heavy atoms',
+    'rings': 'Number of rings',
+    'set_bits': 'FP2 bits',
+    'id': 'Internal identifier',
+    'tani': 'Tanimoto coefficient',
+    'spectrophore': 'Spectrophores(TM)',
+    'dist_spectrophore': 'Spectrophores(TM) distance to target',
+    'synonym': 'Entry id',
 }

 OBDescriptor = {
-    'atoms': ["atoms","Number of atoms"],
-    'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt
-    'can_smiles' : ["cansmi","Canonical SMILES"],
-    'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"],
-    #["abonds","Number of aromatic bonds"],
-    #["bonds","Number of bonds"],
-    #["dbonds","Number of double bonds"],
-    #["formula","Chemical formula"],
-    'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
-    'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
-    'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"],
-    'inchi': ["InChI","IUPAC InChI identifier"],
-    'inchi_key': ["InChIKey","InChIKey"],
-    #["L5","Lipinski Rule of Five"],
-    'logp': ["logP","octanol/water partition coefficient"],
-    'mr': ["MR","molar refractivity"],
-    'molwt': ["MW","Molecular Weight filter"],
-    #["nF","Number of Fluorine Atoms"],
-    #["s","SMARTS filter"],
-    #["sbonds","Number of single bonds"],
-    #["smarts","SMARTS filter"],
-    #["tbonds","Number of triple bonds"],
-    #["title","For comparing a molecule's title"],
-    'psa': ["TPSA","topological polar surface area"],
-    'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'],
+    'atoms': ["atoms", "Number of atoms"],
+    'hatoms': ["hatoms", "Number of heavy atoms"],  # self defined tag hatoms in plugindefines.txt
+    'can_smiles': ["cansmi", "Canonical SMILES"],
+    'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"],
+    # ["abonds", "Number of aromatic bonds"],
+    # ["bonds", "Number of bonds"],
+    # ["dbonds", "Number of double bonds"],
+    # ["formula", "Chemical formula"],
+    'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
+    'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
+    'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"],
+    'inchi': ["InChI", "IUPAC InChI identifier"],
+    'inchi_key': ["InChIKey", "InChIKey"],
+    # ["L5", "Lipinski Rule of Five"],
+    'logp': ["logP", "octanol/water partition coefficient"],
+    'mr': ["MR", "molar refractivity"],
+    'molwt': ["MW", "Molecular Weight filter"],
+    # ["nF", "Number of Fluorine Atoms"],
+    # ["s", "SMARTS filter"],
+    # ["sbonds", "Number of single bonds"],
+    # ["smarts", "SMARTS filter"],
+    # ["tbonds", "Number of triple bonds"],
+    # ["title", "For comparing a molecule's title"],
+    'psa': ["TPSA", "topological polar surface area"],
+    'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'],
 }


@@ -128,9 +137,9 @@
         outfile = open(args.output, 'w')
         requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ')
         if args.header:
-            outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' )
+            outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n')
         for row in rows:
-            outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' )
+            outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n')

     elif args.oformat in ['sdf', 'mol2']:
         outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
@@ -139,103 +148,102 @@
                 mol = pybel.readstring('sdf', row['mol'])
                 if args.oformat == 'sdf':
                     keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ')
-                    mol.data.update( { ColumnNames['synonym'] : row['synonym'] } )
+                    mol.data.update({ColumnNames['synonym']: row['synonym']})
                     if 'inchi_key' in keys:
-                        keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ')
-                    [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key]
+                        keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ')
+                    [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key]
                 outfile.write(mol)
-            except:
+            except OSError:
                 pass
     else:
         outfile = open(args.output, 'w')
-        outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) )
+        outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows]))
     outfile.close()

+
 def pybel_stop_logging():
     openbabel.obErrorLog.StopLogging()

+
 def get_properties_ext(mol):
-
     HBD = pybel.Smarts("[!#6;!H0]")
-    HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" +
-                       "!$(*~N=O);X1,X2]),$([#7;v3;" +
-                       "!$([nH]);!$(*(-a)-a)])]"
-                      )
+    HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);"
+                        "!$(*~N=O);X1,X2]),$([#7;v3;"
+                        "!$([nH]);!$(*(-a)-a)])]"
+                        ))
     calc_desc_dict = mol.calcdesc()

     try:
         logp = calc_desc_dict['logP']
-    except:
+    except KeyError:
         logp = calc_desc_dict['LogP']

     return {"molwt": mol.molwt,
             "logp": logp,
             "donors": len(HBD.findall(mol)),
-            "acceptors": len(HBA.findall(mol)),
+            "acceptors": len(HBA.findall(mol)),
             "psa": calc_desc_dict['TPSA'],
             "mr": calc_desc_dict['MR'],
             "rotbonds": mol.OBMol.NumRotors(),
-            "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
+            "can": mol.write("can").split()[0].strip(),  # this one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
             "inchi": mol.write("inchi").strip(),
             "inchi_key": get_inchikey(mol).strip(),
             "rings": len(mol.sssr),
             "atoms": mol.OBMol.NumHvyAtoms(),
-            "spectrophore" : OBspectrophore(mol),
-           }
+            "spectrophore": OBspectrophore(mol),
+            }
+

 def get_inchikey(mol):
     conv = openbabel.OBConversion()
     conv.SetInAndOutFormats("mol", "inchi")
     conv.SetOptions("K", conv.OUTOPTIONS)
-    inchikey = conv.WriteString( mol.OBMol )
+    inchikey = conv.WriteString(mol.OBMol)
     return inchikey

+
 def OBspectrophore(mol):
     spectrophore = pybel.ob.OBSpectrophore()
     # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
-    spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd )
-    return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] )
+    spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd)
+    return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)])
+

-def squared_euclidean_distance(a, b):
-    try:
-        return ((np.asarray( a ) - np.asarray( b ))**2).sum()
-    except ValueError:
-        return 0
-
-def split_library( lib_path, lib_format = 'sdf', package_size = None ):
+def split_library(lib_path, lib_format='sdf', package_size=None):
     """
-        Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size )
-        IT currently ONLY WORKS FOR SD-Files
+    Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size)
+    IT currently ONLY WORKS FOR SD-Files
     """
     pack = 1
     mol_counter = 0

-    outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
+    outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')

     for line in open(lib_path, 'r'):
-        outfile.write( line )
+        outfile.write(line)
         if line.strip() == '$$$$':
             mol_counter += 1
             if mol_counter % package_size == 0:
                 outfile.close()
                 pack += 1
-                outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
-                if mol_counter*10 % package_size == 0:
-                    print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 ))
+                outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w')
+                if mol_counter * 10 % package_size == 0:
+                    print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1))
     outfile.close()

     return True

-def split_smi_library( smiles_file, structures_in_one_file ):
+
+def split_smi_library(smiles_file, structures_in_one_file):
     """
-        Split a file with SMILES to several files for multiprocessing usage.
-        Usage: split_smi_library( smiles_file, 10 )
+    Split a file with SMILES to several files for multiprocessing usage.
+    Usage: split_smi_library(smiles_file, 10)
     """
     output_files = []
     tfile = tempfile.NamedTemporaryFile(delete=False)

     smiles_handle = open(smiles_file, 'r')
-    for count, line in enumerate( smiles_handle ):
+    for count, line in enumerate(smiles_handle):
         if count % structures_in_one_file == 0 and count != 0:
             tfile.close()
             output_files.append(tfile.name)
@@ -247,9 +255,9 @@
     return output_files


-def mp_run(input_path, regex, PROCESSES, function_to_call ):
+def mp_run(input_path, regex, PROCESSES, function_to_call):
     paths = []
-    [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ]
+    [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))]
     paths.sort()

     pool = Pool(processes=PROCESSES)
@@ -259,6 +267,6 @@

     return paths

+
 if __name__ == '__main__':
     print(check_filetype(sys.argv[1]))
-
--- a/distance_finder.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/distance_finder.py	Mon Oct 19 14:40:22 2020 +0000
@@ -11,7 +11,9 @@
 # a property named distance1 where the numeric part is the index (starting from 1) of the points (in that example
 # there would be properties for distance1, distance2 and distance3.

-import argparse, os, sys, math
+import argparse
+import math
+import sys

 from openbabel import pybel

@@ -30,7 +32,6 @@
     :return:
     """

-
     points = []

     # read the points
@@ -41,7 +42,7 @@
                 p = line.split()
                 if len(p) == 3:
                     points.append((float(p[0]), float(p[1]), float(p[2])))
-                    log("Read points",p)
+                    log("Read points", p)
                     continue
             log("Failed to read line:", line)
     log('Found', len(points), 'atom points')
@@ -56,7 +57,6 @@

         try:
             # print("Processing mol", mol.title)
-
             clone = pybel.Molecule(mol)
             clone.removeh()

@@ -82,7 +82,7 @@
             sdf_writer.write(mol)

         except Exception as e:
-            log('Failed to handle molecule: '+ str(e))
+            log('Failed to handle molecule: ' + str(e))
             continue

     sdf_writer.close()
@@ -93,12 +93,10 @@
     global work_dir

     parser = argparse.ArgumentParser(description='XChem distances - measure distances to particular points')
-
     parser.add_argument('-i', '--input', help="SDF containing the 3D molecules to score)")
     parser.add_argument('-p', '--points', help="PDB format file with atoms")
     parser.add_argument('-o', '--outfile', default='output.sdf', help="File name for results")

-
     args = parser.parse_args()
     log("XChem distances args: ", args)
--- a/macros.xml	Tue Jul 28 08:38:28 2020 -0400
+++ b/macros.xml	Mon Oct 19 14:40:22 2020 +0000
@@ -18,6 +18,11 @@
         help="Valid file types are: SDF, MOL, MOL2, CML, InChI, SMILES, and PDB"/>
     </xml>

+    <xml name="infile_all_types_except_inchi">
+        <param name="infile" format="sdf,mol,mol2,cml,smi,pdb" type="data" label="Molecular input file"
+        help="Valid file types are: SDF, MOL, MOL2, CML, SMILES, and PDB"/>
+    </xml>
+
     <xml name="2D_3D_opts">
         <param name="gen2d" type="boolean" truevalue="--gen2d" falsevalue="" checked="false"
             label="Generate 2D coordinates" help="(--gen2d)" />
--- a/multi_obgrep.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/multi_obgrep.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,40 +4,43 @@
     Output: Molecule file filtered with obgrep.
     Copyright 2013, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse
 import multiprocessing
-import tempfile
-import subprocess
-import shutil
+import os
 import shlex
+import shutil
+import subprocess
+import tempfile

-from openbabel import openbabel, pybel
-openbabel.obErrorLog.StopLogging()
+
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', '--infile', required=True, help='Molecule file.')
-    parser.add_argument('-q', '--query',  required=True, help='Query file, containing different SMARTS in each line.')
+    parser.add_argument('-q', '--query', required=True, help='Query file, containing different SMARTS in each line.')
     parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
     parser.add_argument("--iformat", help="Input format, like smi, sdf, inchi")
     parser.add_argument("--n-times", dest="n_times", type=int,
-                    default=0, help="Print a molecule only if the pattern occurs # times inside the molecule.")
+                        default=0, help="Print a molecule only if the pattern occurs # times inside the molecule.")
     parser.add_argument('-p', '--processors', type=int, default=multiprocessing.cpu_count())
     parser.add_argument("--invert-matches", dest="invert_matches", action="store_true",
-                    default=False, help="Invert the matching, print non-matching molecules.")
+                        default=False, help="Invert the matching, print non-matching molecules.")
     parser.add_argument("--only-name", dest="only_name", action="store_true",
-                    default=False, help="Only print the name of the molecules.")
+                        default=False, help="Only print the name of the molecules.")
     parser.add_argument("--full-match", dest="full_match", action="store_true",
-                    default=False, help="Full match, print matching-molecules only when the number of heavy atoms is also equal to the number of atoms in the SMARTS pattern.")
+                        default=False, help="Full match, print matching-molecules only when the number of heavy atoms is also equal to the number of atoms in the SMARTS pattern.")
     parser.add_argument("--number-of-matches", dest="number_of_matches", action="store_true",
-                    default=False, help="Print the number of matches.")
+                        default=False, help="Print the number of matches.")
     return parser.parse_args()

+
 results = list()
+
+
 def mp_callback(res):
     results.append(res)

-def mp_helper( query, args ):
+
+def mp_helper(query, args):
     """
         Helper function for multiprocessing.
         That function is a wrapper around obgrep.
@@ -57,44 +60,44 @@

     tmp = tempfile.NamedTemporaryFile(delete=False)
     cmd = 'obgrep %s "%s" %s' % (' '.join(cmd_list), query, args.infile)
-    child = subprocess.Popen(shlex.split(cmd),
-        stdout=open(tmp.name, 'w+'), stderr=subprocess.PIPE)
+    child = subprocess.Popen(shlex.split(cmd), stdout=open(tmp.name, 'w+'), stderr=subprocess.PIPE)

     stdout, stderr = child.communicate()
     return (tmp.name, query)


-def obgrep( args ):
-
+def obgrep(args):
     temp_file = tempfile.NamedTemporaryFile()
     temp_link = "%s.%s" % (temp_file.name, args.iformat)
     temp_file.close()
     os.symlink(args.infile, temp_link)
     args.infile = temp_link

-    pool = multiprocessing.Pool( args.processors )
-    for query in open( args.query ):
+    pool = multiprocessing.Pool(args.processors)
+    for query in open(args.query):
         pool.apply_async(mp_helper, args=(query.strip(), args), callback=mp_callback)
-        #mp_callback( mp_helper(query.strip(), args) )
+        # mp_callback(mp_helper(query.strip(), args))
     pool.close()
     pool.join()

-    out_handle = open( args.outfile, 'wb' )
+    out_handle = open(args.outfile, 'wb')
     for result_file, query in results:
-        res_handle = open(result_file,'rb')
-        shutil.copyfileobj( res_handle, out_handle )
+        res_handle = open(result_file, 'rb')
+        shutil.copyfileobj(res_handle, out_handle)
         res_handle.close()
-        os.remove( result_file )
+        os.remove(result_file)
     out_handle.close()

-    os.remove( temp_link )
+    os.remove(temp_link)
+

 def __main__():
     """
         Multiprocessing obgrep search.
     """
     args = parse_command_line()
-    obgrep( args )
+    obgrep(args)
+

-if __name__ == "__main__" :
+if __name__ == "__main__":
     __main__()
--- a/ob_addh.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/ob_addh.py	Mon Oct 19 14:40:22 2020 +0000
@@ -3,21 +3,23 @@
     Input:  Molecule file
     Output: Molecule file with hydrogen atoms added at the target pH.
 """
-import sys, os
 import argparse
+import sys

 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()

+
 def parse_command_line(argv):
     parser = argparse.ArgumentParser()
-    parser.add_argument('--iformat', type=str, default='sdf' , help='input file format')
+    parser.add_argument('--iformat', type=str, default='sdf', help='input file format')
     parser.add_argument('-i', '--input', type=str, required=True, help='input file name')
     parser.add_argument('-o', '--output', type=str, required=True, help='output file name')
     parser.add_argument('--polar', action="store_true", default=False, help='Add hydrogen atoms only to polar atoms')
     parser.add_argument('--pH', type=float, default="7.4", help='Specify target pH value')
     return parser.parse_args()

+
 def addh(args):
     outfile = pybel.Outputfile(args.iformat, args.output, overwrite=True)
     for mol in pybel.readfile(args.iformat, args.input):
@@ -27,6 +29,7 @@
             outfile.write(mol)
     outfile.close()

+
 def __main__():
     """
         Add hydrogen atoms at a certain pH value
@@ -34,5 +37,6 @@
     args = parse_command_line(sys.argv)
     addh(args)

-if __name__ == "__main__" :
+
+if __name__ == "__main__":
     __main__()
--- a/ob_filter.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/ob_filter.py	Mon Oct 19 14:40:22 2020 +0000
@@ -6,35 +6,32 @@

     TODO: AND/OR conditions?
 """
-import sys, os
 import argparse
+import json
+import shlex
+import subprocess
+import sys
+
 import cheminfolib
-import json
-import shlex, subprocess
-
 from openbabel import pybel
 cheminfolib.pybel_stop_logging()

+
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', '--input', help='Input file name')
     parser.add_argument('-iformat', help='Input file format')
-    parser.add_argument('-oformat',
-        default='smi',
-        help='Output file format')
-    parser.add_argument('-o', '--output',
-        help='Output file name',
-        required=True)
-    parser.add_argument('--filters',
-        help="Specify the filters to apply",
-        required=True,
-        )
-    parser.add_argument('--list_of_names',
-        help="A file with list of molecule names to extract. Every name is in one line.",
-        required=False,
-        )
+    parser.add_argument('-oformat', default='smi',
+                        help='Output file format')
+    parser.add_argument('-o', '--output', help='Output file name',
+                        required=True)
+    parser.add_argument('--filters', help="Specify the filters to apply",
+                        required=True)
+    parser.add_argument('--list_of_names', required=False,
+                        help="A file with list of molecule names to extract. Every name is in one line.")
     return parser.parse_args()

+
 def filter_precalculated_compounds(args, filters):
     outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
     for mol in pybel.readfile('sdf', args.input):
@@ -53,6 +50,7 @@
             outfile.write(mol)
     outfile.close()

+
 def filter_new_compounds(args, filters):

     if args.iformat == args.oformat:
@@ -70,10 +68,9 @@
         filter_cmd += ' %s>=%s %s<=%s ' % (ob_descriptor_name, min, ob_descriptor_name, max)

     args = shlex.split('%s "%s"' % (cmd, filter_cmd))
-    #print '%s "%s"' % (cmd, filter_cmd)
+    # print '%s "%s"' % (cmd, filter_cmd)
     # calling openbabel with subprocess and pipe potential errors occuring in openbabel to stdout
-    child = subprocess.Popen(args,
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

     stdout, stderr = child.communicate()
     return_code = child.returncode
@@ -87,6 +84,7 @@
         sys.stdout.write(stdout.decode('utf-8'))
         sys.stdout.write(stderr.decode('utf-8'))

+
 def filter_by_name(args):
     outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
     for mol in pybel.readfile('sdf', args.input):
@@ -95,16 +93,17 @@
                 outfile.write(mol)
     outfile.close()

+
 def __main__():
     """
         Select compounds with certain properties from a small library
     """
     args = parse_command_line()
-
+
     if args.filters == '__filter_by_name__':
         filter_by_name(args)
         return
-
+
     # Its a small trick to get the parameters in an easy way from the xml file.
     # To keep it readable in the xml file, many white-spaces are included in that string it needs to be removed.
     # Also the last loop creates a ',{' that is not an valid jason expression.
@@ -114,7 +113,7 @@
         mol = next(pybel.readfile('sdf', args.input))
         for key, elem in filters.items():
             property = cheminfolib.ColumnNames.get(key, key)
-            if not property in mol.data:
+            if property not in mol.data:
                 break
         else:
             # if the for loop finishes in a normal way, we should habe all properties at least in the first molecule
@@ -124,5 +123,5 @@
     filter_new_compounds(args, filters)


-if __name__ == "__main__" :
+if __name__ == "__main__":
     __main__()
--- a/ob_genProp.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/ob_genProp.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,23 +4,25 @@
     Output: Physico-chemical properties are computed and stored as metadata in the sdf output file.
     Copyright 2012, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse
+import sys
+
+import cheminfolib
 import openbabel
+from openbabel import pybel
 openbabel.obErrorLog.StopLogging()
-import cheminfolib

-from openbabel import pybel

 def parse_command_line(argv):
     parser = argparse.ArgumentParser()
-    parser.add_argument('--iformat', default='sdf' , help='input file format')
+    parser.add_argument('--iformat', default='sdf', help='input file format')
     parser.add_argument('-i', '--input', required=True, help='input file name')
-    parser.add_argument('--oformat', default='sdf', choices = ['sdf', 'table'] , help='output file format')
+    parser.add_argument('--oformat', default='sdf', choices=['sdf', 'table'], help='output file format')
     parser.add_argument('--header', type=bool, help='Include the header as the first line of the output table')
     parser.add_argument('-o', '--output', required=True, help='output file name')
     return parser.parse_args()

+
 def compute_properties(args):
     if args.oformat == 'sdf':
         outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
@@ -29,18 +31,19 @@
         if args.header:
             mol = next(pybel.readfile(args.iformat, args.input))
             metadata = cheminfolib.get_properties_ext(mol)
-            outfile.write( '%s\n' % '\t'.join( [ cheminfolib.ColumnNames[key] for key in metadata ] ) )
+            outfile.write('%s\n' % '\t'.join([cheminfolib.ColumnNames[key] for key in metadata]))

     for mol in pybel.readfile(args.iformat, args.input):
         if mol.OBMol.NumHvyAtoms() > 5:
             metadata = cheminfolib.get_properties_ext(mol)
             if args.oformat == 'sdf':
-                [ mol.data.update( { cheminfolib.ColumnNames[key] : metadata[key] } ) for key in metadata ]
+                [mol.data.update({cheminfolib.ColumnNames[key]: metadata[key]}) for key in metadata]
                 outfile.write(mol)
             else:
-                outfile.write( '%s\n' % ('\t'.join( [ str(metadata[key]) for key in metadata ] ) ) )
+                outfile.write('%s\n' % ('\t'.join([str(metadata[key]) for key in metadata])))
     outfile.close()

+
 def __main__():
     """
         Physico-chemical properties are computed and stored as metadata in the sdf output file
@@ -48,5 +51,6 @@
     args = parse_command_line(sys.argv)
     compute_properties(args)

-if __name__ == "__main__" :
+
+if __name__ == "__main__":
     __main__()
--- a/ob_remIons.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/ob_remIons.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,29 +4,33 @@
     Output: Molecule file with removed ions and fragments.
     Copyright 2012, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse

 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()

+
 def parse_command_line():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-iformat', default='sdf' , help='input file format')
+    parser.add_argument('-iformat', default='sdf', help='input file format')
     parser.add_argument('-i', '--input', required=True, help='input file name')
     parser.add_argument('-o', '--output', required=True, help='output file name')
     return parser.parse_args()

+
 def remove_ions(args):
     outfile = pybel.Outputfile(args.iformat, args.output, overwrite=True)
     for mol in pybel.readfile(args.iformat, args.input):
         if mol.OBMol.NumHvyAtoms() > 5:
             mol.OBMol.StripSalts(0)
+            if 'inchi' in mol.data:
+                del mol.data['inchi']  # remove inchi cache so modified mol is saved
             # Check if new small fragments have been created and remove them
             if mol.OBMol.NumHvyAtoms() > 5:
                 outfile.write(mol)
     outfile.close()

+
 def __main__():
     """
         Remove any counterion and delete any fragment but the largest one for each molecule.
@@ -34,5 +38,6 @@
     args = parse_command_line()
     remove_ions(args)

-if __name__ == "__main__" :
+
+if __name__ == "__main__":
     __main__()
--- a/ob_spectrophore_search.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/ob_spectrophore_search.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,18 +4,17 @@
     Output: parse the target file using the same protocol used to generate the databases in our servers. Physico-chemical properties are computed and stored as metadata in the sdf output file.
     Copyright 2012, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse
-import math
+
 import numpy as np
-
 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()
-#TODO get rid of eval()
+# TODO get rid of eval()

 global spectrophore
 spectrophore = pybel.ob.OBSpectrophore()

+
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument('--target', required=True, help='target file name in sdf format with Spectrophores(TM) descriptors stored as meta-data')
@@ -28,26 +27,29 @@
     parser.add_argument('-r', '--resolution', type=float, default="3.0", help='Resolution')
     return parser.parse_args()

+
 def set_parameters(args):
     if args.normalization == 'No':
-        spectrophore.SetNormalization( spectrophore.NoNormalization )
+        spectrophore.SetNormalization(spectrophore.NoNormalization)
     else:
-        spectrophore.SetNormalization( eval('spectrophore.NormalizationTowards' + args.normalization) )
-    spectrophore.SetAccuracy( eval('spectrophore.AngStepSize' + args.accuracy) )
-    spectrophore.SetStereo( eval('spectrophore.' + args.stereo + 'StereoSpecificProbes') )
-    spectrophore.SetResolution( args.resolution )
+        spectrophore.SetNormalization(eval('spectrophore.NormalizationTowards' + args.normalization))
+    spectrophore.SetAccuracy(eval('spectrophore.AngStepSize' + args.accuracy))
+    spectrophore.SetStereo(eval('spectrophore.' + args.stereo + 'StereoSpecificProbes'))
+    spectrophore.SetResolution(args.resolution)
     return True

+
 def Compute_Spectrophores_distance(target_spectrophore, args):
     outfile = open(args.output, 'w')
     for mol in open(args.library, 'r'):
         try:
-            distance = ( ( np.asarray( target_spectrophore, dtype=float ) - np.asarray( mol.split('\t')[ args.column - 1 ].strip().split(', '), dtype=float) )**2).sum()
+            distance = ((np.asarray(target_spectrophore, dtype=float) - np.asarray(mol.split('\t')[args.column - 1].strip().split(', '), dtype=float))**2).sum()
         except ValueError:
             distance = 0
-        outfile.write( '%s\t%f\n' % (mol.strip(), distance ) )
+        outfile.write('%s\t%f\n' % (mol.strip(), distance))
     outfile.close()

+
 def __main__():
     """
         Computation of Spectrophores(TM) distances to a target molecule.
@@ -59,7 +61,8 @@
     mol = next(pybel.readfile('sdf', args.target))
     target_spectrophore = mol.data["Spectrophores(TM)"].strip().split(', ')
     # Compute the paired-distance between every molecule in the library and the target
-    distances = Compute_Spectrophores_distance(target_spectrophore, args)
+    Compute_Spectrophores_distance(target_spectrophore, args)
+

-if __name__ == "__main__" :
+if __name__ == "__main__":
     __main__()
--- a/remove_protonation_state.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/remove_protonation_state.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,32 +4,37 @@
     Output: Molecule file with removed ions and fragments.
     Copyright 2013, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse

 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()

+
 def parse_command_line():
     parser = argparse.ArgumentParser()
-    parser.add_argument('--iformat', default='sdf' , help='input file format')
+    parser.add_argument('--iformat', default='sdf', help='input file format')
     parser.add_argument('-i', '--input', required=True, help='input file name')
     parser.add_argument('-o', '--output', required=True, help='output file name')
     return parser.parse_args()

-def remove_protonation( args ):
+
+def remove_protonation(args):
     outfile = pybel.Outputfile(args.iformat, args.output, overwrite=True)
     for mol in pybel.readfile(args.iformat, args.input):
         [atom.OBAtom.SetFormalCharge(0) for atom in mol.atoms]
-        outfile.write( mol )
+        if 'inchi' in mol.data:
+            del mol.data['inchi']  # remove inchi cache so modified mol is saved
+        outfile.write(mol)
     outfile.close()

+
 def __main__():
     """
         Remove any protonation state from each atom in each molecule.
     """
     args = parse_command_line()
-    remove_protonation( args )
+    remove_protonation(args)
+

-if __name__ == "__main__" :
+if __name__ == "__main__":
     __main__()
--- a/subsearch.py	Tue Jul 28 08:38:28 2020 -0400
+++ b/subsearch.py	Mon Oct 19 14:40:22 2020 +0000
@@ -4,36 +4,41 @@
     Output: Molecules filtered with specified substructures.
     Copyright 2013, Bjoern Gruening and Xavier Lucas
 """
-import sys, os
 import argparse
 import multiprocessing
-import tempfile
+import os
+import shutil
 import subprocess
-import shutil
+import sys
+import tempfile

 from openbabel import openbabel, pybel
 openbabel.obErrorLog.StopLogging()

+
 def parse_command_line():
     parser = argparse.ArgumentParser()
     parser.add_argument('-i', '--infile', required=True, help='Molecule file.')
     parser.add_argument('--iformat', help='Input format.')
-    parser.add_argument('--fastsearch-index', dest="fastsearch_index",
-        required=True, help='Path to the openbabel fastsearch index.')
+    parser.add_argument('--fastsearch-index', dest="fastsearch_index", required=True,
+                        help='Path to the openbabel fastsearch index.')
     parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.')
-    parser.add_argument('--oformat',
-        default='smi', help='Output file format')
-    parser.add_argument("--max-candidates", dest="max_candidates", type=int,
-                    default=4000, help="The maximum number of candidates.")
-    parser.add_argument('-p', '--processors', type=int,
-        default=multiprocessing.cpu_count())
+    parser.add_argument('--oformat', default='smi', help='Output file format')
+    parser.add_argument("--max-candidates", dest="max_candidates", type=int, default=4000,
+                        help="The maximum number of candidates.")
+    parser.add_argument('-p', '--processors', type=int,
+                        default=multiprocessing.cpu_count())
     return parser.parse_args()

+
 results = list()
+
+
 def mp_callback(res):
     results.append(res)

-def mp_helper( query, args ):
+
+def mp_helper(query, args):
     """
         Helper function for multiprocessing.
         That function is a wrapper around the following command:
@@ -48,8 +53,7 @@
     tmp = tempfile.NamedTemporaryFile(delete=False)
     cmd = 'obabel -ifs %s -O %s %s -s%s -al %s' % (args.fastsearch_index, tmp.name, opts, query, args.max_candidates)

-    child = subprocess.Popen(cmd.split(),
-        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    child = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)

     stdout, stderr = child.communicate()
     return_code = child.returncode
@@ -65,43 +69,43 @@
     return (tmp.name, query)


-def get_smiles_or_smarts( args ):
+def get_smiles_or_smarts(args):
     """
     Wrapper to retrieve a stripped SMILES or SMARTS string from different input formats.
     """
     if args.iformat in ['smi', 'text', 'tabular']:
-        with open( args.infile ) as text_file:
+        with open(args.infile) as text_file:
             for line in text_file:
                 yield line.split('\t')[0].strip()
     else:
         # inchi or sdf files
-        for mol in pybel.readfile( args.iformat, args.infile ):
+        for mol in pybel.readfile(args.iformat, args.infile):
             yield mol.write('smiles').split('\t')[0]

-def substructure_search( args ):

-    pool = multiprocessing.Pool( args.processors )
-    for query in get_smiles_or_smarts( args ):
+def substructure_search(args):
+    pool = multiprocessing.Pool(args.processors)
+    for query in get_smiles_or_smarts(args):
         pool.apply_async(mp_helper, args=(query, args), callback=mp_callback)
-        #mp_callback( mp_helper(query, args) )
+        # mp_callback(mp_helper(query, args))
     pool.close()
     pool.join()

     if args.oformat == 'names':
-        out_handle = open( args.outfile, 'w' )
+        out_handle = open(args.outfile, 'w')
         for result_file, query in results:
             with open(result_file) as res_handle:
                 for line in res_handle:
-                    out_handle.write('%s\t%s\n' % ( line.strip(), query ))
-            os.remove( result_file )
+                    out_handle.write('%s\t%s\n' % (line.strip(), query))
+            os.remove(result_file)
         out_handle.close()
     else:
-        out_handle = open( args.outfile, 'wb' )
+        out_handle = open(args.outfile, 'wb')
         for result_file, query in results:
-            res_handle = open(result_file,'rb')
-            shutil.copyfileobj( res_handle, out_handle )
+            res_handle = open(result_file, 'rb')
+            shutil.copyfileobj(res_handle, out_handle)
             res_handle.close()
-            os.remove( result_file )
+            os.remove(result_file)
         out_handle.close()


@@ -110,7 +114,8 @@
         Multiprocessing Open Babel Substructure Search.
     """
     args = parse_command_line()
-    substructure_search( args )
+    substructure_search(args)
+

-if __name__ == "__main__" :
+if __name__ == "__main__":
     __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/na-sal.inchi	Mon Oct 19 14:40:22 2020 +0000
@@ -0,0 +1,1 @@
+InChI=1S/C7H6O3.Na/c8-6-4-2-1-3-5(6)7(9)10;/h1-4,8H,(H,9,10);/q;+1/p-1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/na-sal_obrmions.inchi	Mon Oct 19 14:40:22 2020 +0000
@@ -0,0 +1,1 @@
+InChI=1S/C7H6O3/c8-6-4-2-1-3-5(6)7(9)10/h1-4,8H,(H,9,10)/p-1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ob_remove_protonation_state.inchi	Mon Oct 19 14:40:22 2020 +0000
@@ -0,0 +1,1 @@
+InChI=1S/C7H5O3.Na/c8-6-4-2-1-3-5(6)7(9)10;/h1-4H,(H,9,10);