diff cheminfolib.py @ 0:2704d4017b13 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 01da22e4184a5a6f6a3dd4631a7b9c31d1b6d502
author bgruening
date Sat, 20 May 2017 08:39:53 -0400
parents
children aebc671bae78
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cheminfolib.py	Sat May 20 08:39:53 2017 -0400
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+"""
+    Small library with cheminformatic functions based on openbabel and pgchem.
+    Copyright 2012, Bjoern Gruening and Xavier Lucas
+"""
+
+import os, sys
+
+try:
+    from galaxy import eggs
+    eggs.require('psycopg2')
+except:
+    print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB')
+
+try:
+    import pybel
+    import openbabel
+except:
+    print('OpenBabel could not be found. A few functions are not available without OpenBabel.')
+
+from multiprocessing import Pool
+import glob, tempfile, re
+import subprocess
+
+def CountLines( path ):
+    out = subprocess.Popen(['wc', '-l', path],
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT
+                         ).communicate()[0]
+    return int(out.partition(b' ')[0])
+
+def grep(pattern, file_obj):
+    grepper = re.compile(pattern)
+    for line in file_obj:
+        if grepper.search(line):
+            return True
+    return False
+
+def check_filetype(filepath):
+    mol = False
+    possible_inchi = True
+    for line_counter, line in enumerate(open(filepath)):
+        if line_counter > 10000:
+            break
+        if line.find('$$$$') != -1:
+            return 'sdf'
+        elif line.find('@<TRIPOS>MOLECULE') != -1:
+            return 'mol2'
+        elif line.find('ligand id') != -1:
+            return 'drf'
+        elif possible_inchi and re.findall('^InChI=', line):
+            return 'inchi'
+        elif re.findall('^M\s+END', line):
+            mol = True
+        # first line is not an InChI, so it can't be an InChI file
+        possible_inchi = False
+
+    if mol:
+        # END can occures before $$$$, so and SDF file will 
+        # be recognised as mol, if you not using this hack'
+        return 'mol'
+    return 'smi'
+
+def db_connect(args):
+    try:
+        db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd));
+        return db_conn
+    except:
+        sys.exit('Unable to connect to the db')
+
+ColumnNames = {
+    'can_smiles' : 'Canonical SMILES',
+    'can' : 'Canonical SMILES',
+    'inchi' : 'InChI',
+    'inchi_key' : 'InChI key',
+    'inchi_key_first' : 'InChI key first',
+    'inchi_key_last' : 'InChI key last',
+    'molwt' : 'Molecular weight',
+    'hbd' : 'Hydrogen-bond donors',
+    'donors' : 'Hydrogen-bond donors',
+    'hba' : 'Hydrogen-bond acceptors',
+    'acceptors' : 'Hydrogen-bond acceptors',
+    'rotbonds' : 'Rotatable bonds',
+    'logp' : 'logP',
+    'psa' : 'Polar surface area',
+    'mr' : 'Molecular refractivity',
+    'atoms' : 'Number of heavy atoms',
+    'rings' : 'Number of rings',
+    'set_bits' : 'FP2 bits',
+    'id' : 'Internal identifier',
+    'tani' : 'Tanimoto coefficient',
+    'spectrophore' : 'Spectrophores(TM)',
+    'dist_spectrophore' : 'Spectrophores(TM) distance to target',
+    'synonym' : 'Entry id',
+}
+
+OBDescriptor = {
+    'atoms': ["atoms","Number of atoms"],
+    'hatoms': ["hatoms","Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt
+    'can_smiles' : ["cansmi","Canonical SMILES"],
+    'can_smilesNS' : ["cansmiNS","Canonical SMILES without isotopes or stereo"],
+    #["abonds","Number of aromatic bonds"],
+    #["bonds","Number of bonds"],
+    #["dbonds","Number of double bonds"],
+    #["formula","Chemical formula"],
+    'hba': ["HBA1","Number of Hydrogen Bond Acceptors 1 (JoelLib)"],
+    'hba2': ["HBA2","Number of Hydrogen Bond Acceptors 2 (JoelLib)"],
+    'hbd': ["HBD","Number of Hydrogen Bond Donors (JoelLib)"],
+    'inchi': ["InChI","IUPAC InChI identifier"],
+    'inchi_key': ["InChIKey","InChIKey"],
+    #["L5","Lipinski Rule of Five"],
+    'logp': ["logP","octanol/water partition coefficient"],
+    'mr': ["MR","molar refractivity"],
+    'molwt': ["MW","Molecular Weight filter"],
+    #["nF","Number of Fluorine Atoms"],
+    #["s","SMARTS filter"],
+    #["sbonds","Number of single bonds"],
+    #["smarts","SMARTS filter"],
+    #["tbonds","Number of triple bonds"],
+    #["title","For comparing a molecule's title"],
+    'psa': ["TPSA","topological polar surface area"],
+    'rotbonds' : ['ROTATABLE_BOND', 'rotatable bonds'],
+}
+
+
+def print_output(args, rows):
+    if args.oformat == 'table':
+        outfile = open(args.output, 'w')
+        requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ')
+        if args.header:
+            outfile.write( 'Identifier\t' + '\t'.join( [ColumnNames[key] for key in requested_fields] ) + '\n' )
+        for row in rows:
+            outfile.write( row['synonym'] + '\t' + '\t'.join( [str(row[key]) for key in requested_fields] ) + '\n' )
+
+    elif args.oformat in ['sdf', 'mol2']:
+        outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
+        for row in rows:
+            try:
+                mol = pybel.readstring('sdf', row['mol'])
+                if args.oformat == 'sdf':
+                    keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ')
+                    mol.data.update( { ColumnNames['synonym'] : row['synonym'] } )
+                    if 'inchi_key' in keys:
+                        keys = (', '.join(keys).replace( "inchi_key", "inchi_key_first, inchi_key_last" )).split(', ')
+                    [ mol.data.update( { ColumnNames[key] : row[key] } ) for key in keys if key]
+                outfile.write(mol)
+            except:
+                pass
+    else:
+        outfile = open(args.output, 'w')
+        outfile.write( '\n'.join( [ '%s\t%s' % (row[args.oformat], row['synonym'] ) for row in rows ] ) )
+    outfile.close()
+
+def pybel_stop_logging():
+    openbabel.obErrorLog.StopLogging()
+
+def get_properties_ext(mol):
+
+    HBD = pybel.Smarts("[!#6;!H0]")
+    HBA = pybel.Smarts("[$([$([#8,#16]);!$(*=N~O);" +
+                       "!$(*~N=O);X1,X2]),$([#7;v3;" +
+                       "!$([nH]);!$(*(-a)-a)])]"
+                      )
+    calc_desc_dict = mol.calcdesc()
+
+    try:
+        logp = calc_desc_dict['logP']
+    except:
+        logp = calc_desc_dict['LogP']
+
+    return {"molwt": mol.molwt,
+            "logp": logp,
+            "donors": len(HBD.findall(mol)),
+            "acceptors": len(HBA.findall(mol)), 
+            "psa": calc_desc_dict['TPSA'],
+            "mr": calc_desc_dict['MR'],
+            "rotbonds": mol.OBMol.NumRotors(),
+            "can": mol.write("can").split()[0].strip(), ### tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string)
+            "inchi": mol.write("inchi").strip(),
+            "inchi_key": get_inchikey(mol).strip(),
+            "rings": len(mol.sssr),
+            "atoms": mol.OBMol.NumHvyAtoms(),
+            "spectrophore" : OBspectrophore(mol),
+           }
+
+def get_inchikey(mol):
+    conv = openbabel.OBConversion()
+    conv.SetInAndOutFormats("mol", "inchi")
+    conv.SetOptions("K", conv.OUTOPTIONS)
+    inchikey = conv.WriteString( mol.OBMol )
+    return inchikey
+
+def OBspectrophore(mol):
+    spectrophore = pybel.ob.OBSpectrophore()
+    # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages.
+    spectrophore.SetNormalization( spectrophore.NormalizationTowardsZeroMeanAndUnitStd )
+    return ', '.join( [ "%.3f" % value for value in spectrophore.GetSpectrophore( mol.OBMol ) ] )
+
+def squared_euclidean_distance(a, b):
+    try:
+        return ((np.asarray( a ) - np.asarray( b ))**2).sum()
+    except ValueError:
+        return 0
+
+def split_library( lib_path, lib_format = 'sdf', package_size = None ):
+    """
+        Split a library of compounds. Usage: split_library( lib_path, lib_format, package_size )
+        IT currently ONLY WORKS FOR SD-Files
+    """
+    pack = 1
+    mol_counter = 0
+
+    outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
+
+    for line in open(lib_path, 'r'):
+        outfile.write( line )
+        if line.strip() == '$$$$':
+            mol_counter += 1
+            if mol_counter % package_size == 0:
+                outfile.close()
+                pack += 1
+                outfile = open('/%s/%s_pack_%i.%s' % ( '/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w' )
+                if mol_counter*10 % package_size == 0:
+                    print('%i molecules parsed, starting pack nr. %i' % ( mol_counter, pack - 1 ))
+    outfile.close()
+
+    return True
+
+def split_smi_library( smiles_file, structures_in_one_file ):
+    """
+        Split a file with SMILES to several files for multiprocessing usage. 
+        Usage: split_smi_library( smiles_file, 10 )
+    """
+    output_files = []
+    tfile = tempfile.NamedTemporaryFile(delete=False)
+
+    smiles_handle = open(smiles_file, 'r')
+    for count, line in enumerate( smiles_handle ):
+        if count % structures_in_one_file == 0 and count != 0:
+            tfile.close()
+            output_files.append(tfile.name)
+            tfile = tempfile.NamedTemporaryFile(delete=False)
+        tfile.write(line)
+    tfile.close()
+    output_files.append(tfile.name)
+    smiles_handle.close()
+    return output_files
+
+
+def mp_run(input_path, regex, PROCESSES, function_to_call ):
+    paths = []
+    [ paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex)) ]
+    paths.sort()
+
+    pool = Pool(processes=PROCESSES)
+    print('Process initialized with', PROCESSES, 'processors')
+    result = pool.map_async(function_to_call, paths)
+    result.get()
+
+    return paths
+
+if __name__ == '__main__':
+    print(check_filetype(sys.argv[1]))
+