annotate filter/ob_filter.py @ 1:b52872a1755a draft

update cheminfolib dependency
author bgruening
date Wed, 21 Aug 2013 03:06:02 -0400
parents 527ecd2fc500
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
2 """
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
3 Input: set of molecules with pre-calculated physico-chemical properties
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
4 Output: set of molecules that pass all the filters
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
5 Copyright 2012, Bjoern Gruening and Xavier Lucas
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
6
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
7 TODO: AND/OR conditions?
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
8 """
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
9 import sys, os
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
10 import argparse
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
11 import cheminfolib
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
12 import json
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
13 import pybel
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
14 import shlex, subprocess
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
15
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
# Module-level side effect executed on import/run.
# NOTE(review): presumably silences Open Babel's logging output; the helper
# is defined in cheminfolib, not in this file -- confirm there.
cheminfolib.pybel_stop_logging()
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
17
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
def parse_command_line():
    """Parse the command-line options of the filter tool.

    The input file and its format are mandatory: every code path in this
    script reads ``args.input`` and ``args.iformat``, so a missing value is
    rejected here with a clear argparse error instead of failing later.

    Returns:
        argparse.Namespace: with the attributes ``input``, ``iformat``,
        ``oformat`` (defaults to ``'smi'``), ``output`` and ``filters``
        (the raw, still unparsed filter string).

    Raises:
        SystemExit: raised by argparse when a required option is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input',
                        help='Input file name',
                        required=True)
    parser.add_argument('-iformat',
                        help='Input file format',
                        required=True)
    parser.add_argument('-oformat',
                        default='smi',
                        help='Output file format')
    parser.add_argument('-o', '--output',
                        help='Output file name',
                        required=True)
    parser.add_argument('--filters',
                        help="Specify the filters to apply",
                        required=True)
    return parser.parse_args()
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
33
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
def filter_precalculated_compounds(args, filters):
    """Write the molecules whose stored properties satisfy every filter.

    The input is read as SD file; each molecule is kept only if, for every
    filter, the value stored in its data block lies within the inclusive
    [min, max] range of that filter.

    Args:
        args: parsed command-line namespace; ``input``, ``output`` and
            ``oformat`` are read.
        filters: mapping of a short descriptor key to a ``[min, max]`` pair.
            Keys are translated through ``cheminfolib.ColumnNames`` to the
            property names stored in the SD file.

    Raises:
        KeyError: if a filter key is unknown to ``cheminfolib.ColumnNames``
            or a molecule lacks one of the required properties.
        ValueError: if a bound or a stored property value is not numeric.
    """
    # Resolve property names and numeric bounds once, not once per molecule.
    bounds = [
        (cheminfolib.ColumnNames[key], float(limits[0]), float(limits[1]))
        for key, limits in filters.items()
    ]

    outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True)
    try:
        for mol in pybel.readfile('sdf', args.input):
            # all() stops at the first violated filter, like the former
            # break; with no filters at all every molecule is written.
            if all(lower <= float(mol.data[name]) <= upper
                   for name, lower, upper in bounds):
                outfile.write(mol)
    finally:
        # Close the output file even if a molecule raises while filtering.
        outfile.close()
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
51
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
def filter_new_compounds(args, filters):
    """Filter the input with the ``obabel`` command-line tool.

    One ``--filter`` expression is built from all filters and passed to a
    single obabel call that reads ``args.input`` and writes ``args.output``.

    Args:
        args: parsed command-line namespace; ``input``, ``iformat``,
            ``output`` and ``oformat`` are read.
        filters: mapping of a short descriptor key to a ``[min, max]`` pair.
            ``cheminfolib.OBDescriptor[key][0]`` supplies the Open Babel
            descriptor name for each key.

    Raises:
        KeyError: if a filter key is unknown to ``cheminfolib.OBDescriptor``.
    """
    # Build the argument vector directly instead of formatting a string and
    # re-splitting it, so file names with spaces or quotes stay intact.
    command = ['obabel', '-i%s' % args.iformat, args.input]
    if args.iformat == args.oformat:
        # use the -ocopy option from openbabel to speed up the filtering, additionally no conversion is carried out
        # http://openbabel.org/docs/dev/FileFormats/Copy_raw_text.html#copy-raw-text
        command.append('-ocopy')
    else:
        command.append('-o%s' % args.oformat)
    command.extend(['-O', args.output, '--filter'])

    # OBDescriptor stores a mapping from our desc shortcut to the OB name [0] and a long description [1]
    conditions = []
    for key, limits in filters.items():
        ob_descriptor_name = cheminfolib.OBDescriptor[key][0]
        conditions.append(' %s>=%s %s<=%s ' % (
            ob_descriptor_name, limits[0], ob_descriptor_name, limits[1]))
    # The whole expression is a single argument of --filter.
    command.append(''.join(conditions))

    # Call openbabel and capture both streams. universal_newlines=True makes
    # the captured output text, so it can be written to sys.stdout/sys.stderr
    # on Python 2 and Python 3 alike.
    child = subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True)
    stdout, stderr = child.communicate()
    return_code = child.returncode

    if return_code:
        sys.stdout.write(stdout)
        sys.stderr.write(stderr)
        sys.stderr.write("Return error code %i from command:\n" % return_code)
        # Report the complete command, including the filter expression.
        sys.stderr.write("%s\n" % ' '.join(command))
    else:
        # On success obabel's stderr output is forwarded to stdout rather
        # than to stderr (behaviour kept from the original implementation).
        sys.stdout.write(stdout)
        sys.stdout.write(stderr)
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
85
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
86
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
def __main__():
    """
    Select compounds with certain properties from a small library

    For SD file input whose first molecule already carries every requested
    property, the stored values are filtered directly; in all other cases
    the filtering is delegated to obabel.

    Returns:
        True when the precalculated-property path was taken, otherwise None.
    """
    args = parse_command_line()
    # It's a small trick to get the parameters in an easy way from the xml file.
    # To keep it readable in the xml file, a lot of whitespace is included in that string; it needs to be removed.
    # Also the last loop creates a ',}' that is not a valid JSON expression.
    # All whitespace (not only blanks) is dropped, so a ',' followed by a
    # newline or tab before '}' is repaired as well.
    compact_filters = ''.join(args.filters.split())
    filters = json.loads(compact_filters.replace(',}', '}'))

    if args.iformat == 'sdf':
        # Check if the sdf file contains all of the required metadata to invoke the precalculation filtering.
        # next() with a default works on Python 2 and 3 and yields None for
        # an empty file instead of raising StopIteration.
        first_mol = next(iter(pybel.readfile('sdf', args.input)), None)
        if first_mol is not None and all(
                cheminfolib.ColumnNames[key] in first_mol.data
                for key in filters):
            # We have all properties at least in the first molecule;
            # assume it is the same for all other molecules and start the precalculated filtering.
            filter_precalculated_compounds(args, filters)
            return True
    filter_new_compounds(args, filters)
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
109
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
110
527ecd2fc500 Uploaded
bgruening
parents:
diff changeset
# Run the filter only when this file is executed as a script.
if __name__ == "__main__":
    __main__()