diff rdkit_descriptors.py @ 8:a1c53f0533b0 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 4d0bfcf37bfbedafc7ff0672dfe452766ca8a606"
author bgruening
date Wed, 17 Feb 2021 12:59:43 +0000
parents 6674260c1459
children 0993ac4f4a23
line wrap: on
line diff
--- a/rdkit_descriptors.py	Tue Jul 28 08:43:19 2020 -0400
+++ b/rdkit_descriptors.py	Wed Feb 17 12:59:43 2021 +0000
@@ -1,44 +1,49 @@
 #!/usr/bin/env python
 
-from rdkit.Chem import Descriptors
-from rdkit import Chem
-import sys, os, re
 import argparse
 import inspect
+import sys
 
-def get_supplier( infile, format = 'smiles' ):
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+
+
+def get_supplier(infile, format='smiles'):
     """
-    Returns a generator over a SMILES or InChI file. Every element is of RDKit 
+    Returns a generator over a SMILES or InChI file. Every element is of RDKit
     molecule and has its original string as _Name property.
     """
     with open(infile) as handle:
         for line in handle:
             line = line.strip()
             if format == 'smiles':
-                mol = Chem.MolFromSmiles( line, sanitize=True )
+                mol = Chem.MolFromSmiles(line, sanitize=True)
             elif format == 'inchi':
-                mol = Chem.inchi.MolFromInchi( line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False )
+                mol = Chem.inchi.MolFromInchi(line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False)
             if mol is None:
                 yield False
             else:
-                mol.SetProp( '_Name', line.split('\t')[0] )
+                mol.SetProp('_Name', line.split('\t')[0])
                 yield mol
 
+
 def get_rdkit_descriptor_functions():
     """
     Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function)
     """
-    ret = [ (name, f) for name, f in inspect.getmembers( Descriptors ) if inspect.isfunction( f ) and not name.startswith( '_' ) ]
+    ret = [(name, f) for name, f in inspect.getmembers(Descriptors) if inspect.isfunction(f) and not name.startswith('_')]
+    # some which are not in the official Descriptors module we need to add manually
+    ret.extend([('FormalCharge', Chem.GetFormalCharge), ('SSSR', Chem.GetSSSR)])
     ret.sort()
     return ret
 
 
-def descriptors( mol, functions ):
+def descriptors(mol, functions):
     """
     Calculates the descriptors of a given molecule.
     """
     for name, function in functions:
-        yield (name, function( mol ))
+        yield (name, function(mol))
 
 
 if __name__ == "__main__":
@@ -46,31 +51,44 @@
     parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
     parser.add_argument("--iformat", help="Specify the input file format.")
 
-    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), 
-        default=sys.stdout, help="path to the result file, default it sdtout")
+    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
+                        default=sys.stdout,
+                        help="path to the result file, default is stdout")
+
+    parser.add_argument('-s', '--select', default=None,
+                        help="select a subset of comma-separated descriptors to use")
 
     parser.add_argument("--header", dest="header", action="store_true",
-                    default=False,
-                    help="Write header line.")
+                        default=False,
+                        help="Write header line.")
 
     args = parser.parse_args()
 
     if args.iformat == 'sdf':
-        supplier = Chem.SDMolSupplier( args.infile )
-    elif args.iformat =='smi':
-        supplier = get_supplier( args.infile, format = 'smiles' )
+        supplier = Chem.SDMolSupplier(args.infile)
+    elif args.iformat == 'smi':
+        supplier = get_supplier(args.infile, format='smiles')
     elif args.iformat == 'inchi':
-        supplier = get_supplier( args.infile, format = 'inchi' )
+        supplier = get_supplier(args.infile, format='inchi')
+    elif args.iformat == 'pdb':
+        supplier = [Chem.MolFromPDBFile(args.infile)]
+    elif args.iformat == 'mol2':
+        supplier = [Chem.MolFromMol2File(args.infile)]
 
     functions = get_rdkit_descriptor_functions()
+    if args.select and args.select != 'None':
+        selected = args.select.split(',')
+        functions = [(name, f) for name, f in functions if name in selected]
 
     if args.header:
-        args.outfile.write( '%s\n' % '\t'.join( ['MoleculeID'] + [name for name, f in functions] ) )
+        args.outfile.write('%s\n' % '\t'.join(['MoleculeID'] + [name for name, f in functions]))
 
     for mol in supplier:
         if not mol:
             continue
-        descs = descriptors( mol, functions )
-        molecule_id = mol.GetProp("_Name")
-        args.outfile.write( "%s\n" % '\t'.join( [molecule_id]+ [str(round(res, 6)) for name, res in descs] ) )
-
+        descs = descriptors(mol, functions)
+        try:
+            molecule_id = mol.GetProp("_Name")
+        except KeyError:
+            molecule_id = Chem.MolToSmiles(mol)
+        args.outfile.write("%s\n" % '\t'.join([molecule_id] + [str(round(res, 6)) for name, res in descs]))