Mercurial > repos > bgruening > openbabel_structure_distance_finder

diff ob_filter.py @ 0:c066b5accacf draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit 6c84abdd07f292048bf2194073e2e938e94158c4"
author: bgruening
date: Wed, 25 Mar 2020 16:47:13 -0400
children: f0502d809c4b
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ob_filter.py	Wed Mar 25 16:47:13 2020 -0400
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+"""
+    Input: set of molecules with pre-calculated physico-chemical properties
+    Output: set of molecules that pass all the filters
+    Copyright 2012, Bjoern Gruening and Xavier Lucas
+
+    TODO: AND/OR conditions?
+"""
+import sys, os
+import argparse
+import cheminfolib
+import json
+import pybel
+import shlex, subprocess
+
+cheminfolib.pybel_stop_logging()
+
+def parse_command_line():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--input', help='Input file name')
+    parser.add_argument('-iformat', help='Input file format')
+    parser.add_argument('-oformat', 
+        default='smi',
+        help='Output file format')
+    parser.add_argument('-o', '--output', 
+        help='Output file name',
+        required=True)
+    parser.add_argument('--filters', 
+        help="Specify the filters to apply",
+        required=True,
+        )
+    parser.add_argument('--list_of_names', 
+        help="A file with list of molecule names to extract. Every name is in one line.",
+        required=False,
+        )
+    return parser.parse_args()
+
+def filter_precalculated_compounds(args, filters):
+    outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
+    for mol in pybel.readfile('sdf', args.input):
+        for key, elem in filters.items():
+            # map the short description to the larger metadata names stored in the sdf file
+            property = cheminfolib.ColumnNames[key]
+            min = elem[0]
+            max = elem[1]
+            if float(mol.data[property]) >= float(min) and float(mol.data[property]) <= float(max):
+                pass
+            else:
+                # leave the filter loop, because one filter constrained are not satisfied
+                break
+        else:
+            # if the filter loop terminates in a normal way (no break) all filter rules are satisfied, so save the compound
+            outfile.write(mol)
+    outfile.close()
+
+def filter_new_compounds(args, filters):
+
+    if args.iformat == args.oformat:
+        # use the -ocopy option from openbabel to speed up the filtering, additionally no conversion is carried out
+        # http://openbabel.org/docs/dev/FileFormats/Copy_raw_text.html#copy-raw-text
+        cmd = 'obabel -i%s %s -ocopy -O %s --filter' % (args.iformat, args.input, args.output)
+    else:
+        cmd = 'obabel -i%s %s -o%s -O %s --filter' % (args.iformat, args.input, args.oformat, args.output)
+    filter_cmd = ''
+    # OBDescriptor stores a mapping from our desc shortcut to the OB name [0] and a long description [1]
+    for key, elem in filters.items():
+        ob_descriptor_name = cheminfolib.OBDescriptor[key][0]
+        min = elem[0]
+        max = elem[1]
+        filter_cmd += ' %s>=%s %s<=%s ' % (ob_descriptor_name, min, ob_descriptor_name, max)
+
+    args = shlex.split('%s "%s"' % (cmd, filter_cmd))
+    #print '%s "%s"' % (cmd, filter_cmd)
+    # calling openbabel with subprocess and pipe potential errors occuring in openbabel to stdout
+    child = subprocess.Popen(args,
+        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    stdout, stderr = child.communicate()
+    return_code = child.returncode
+
+    if return_code:
+        sys.stdout.write(stdout.decode('utf-8'))
+        sys.stderr.write(stderr.decode('utf-8'))
+        sys.stderr.write("Return error code %i from command:\n" % return_code)
+        sys.stderr.write("%s\n" % cmd)
+    else:
+        sys.stdout.write(stdout.decode('utf-8'))
+        sys.stdout.write(stderr.decode('utf-8'))
+
+def filter_by_name(args):
+    outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
+    for mol in pybel.readfile('sdf', args.input):
+        for name in open(args.list_of_names):
+            if mol.title.strip() == name.strip():
+                outfile.write(mol)
+    outfile.close()
+
+def __main__():
+    """
+        Select compounds with certain properties from a small library
+    """
+    args = parse_command_line()
+    
+    if args.filters == '__filter_by_name__':
+        filter_by_name(args)
+        return
+    
+    # Its a small trick to get the parameters in an easy way from the xml file.
+    # To keep it readable in the xml file, many white-spaces are included in that string it needs to be removed.
+    # Also the last loop creates a ',{' that is not an valid jason expression.
+    filters = json.loads((args.filters).replace(' ', '').replace(',}', '}'))
+    if args.iformat == 'sdf':
+        # Check if the sdf file contains all of the required metadata to invoke the precalculation filtering
+        mol = next(pybel.readfile('sdf', args.input))
+        for key, elem in filters.items():
+            property = cheminfolib.ColumnNames[key]
+            if not property in mol.data:
+                break
+        else:
+            # if the for loop finishes in a normal way, we should habe all properties at least in the first molecule
+            # assume it is the same for all other molecules and start the precalculated filtering
+            filter_precalculated_compounds(args, filters)
+            return True
+    filter_new_compounds(args, filters)
+
+
+if __name__ == "__main__" :
+    __main__()
author	bgruening
date	Wed, 25 Mar 2020 16:47:13 -0400
parents
children	f0502d809c4b