Mercurial > repos > bgruening > openbabel_genprop
comparison cheminfolib.py @ 15:50eaae9df8d3 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/openbabel commit d9c51279c061a1da948a2582d5b502ca7573adbf
author | bgruening |
---|---|
date | Thu, 15 Aug 2024 11:05:33 +0000 |
parents | 417845394cdf |
children |
comparison
equal
deleted
inserted
replaced
14:b249bf9be0d1 | 15:50eaae9df8d3 |
---|---|
9 import subprocess | 9 import subprocess |
10 import sys | 10 import sys |
11 import tempfile | 11 import tempfile |
12 from multiprocessing import Pool | 12 from multiprocessing import Pool |
13 | 13 |
14 | |
15 try: | 14 try: |
16 from galaxy import eggs | 15 from galaxy import eggs |
17 eggs.require('psycopg2') | 16 |
17 eggs.require("psycopg2") | |
18 except ImportError: | 18 except ImportError: |
19 psycopg2 = None | 19 psycopg2 = None |
20 print('psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB') | 20 print( |
21 "psycopg2 is not available. It is currently used in the pgchem wrappers, that are not shipped with default CTB" | |
22 ) | |
21 | 23 |
22 try: | 24 try: |
23 from openbabel import openbabel, pybel | 25 from openbabel import openbabel, pybel |
26 | |
24 openbabel.obErrorLog.StopLogging() | 27 openbabel.obErrorLog.StopLogging() |
25 except ImportError: | 28 except ImportError: |
26 openbabel, pybel = None, None | 29 openbabel, pybel = None, None |
27 print('OpenBabel could not be found. A few functions are not available without OpenBabel.') | 30 print( |
31 "OpenBabel could not be found. A few functions are not available without OpenBabel." | |
32 ) | |
28 | 33 |
29 | 34 |
30 def CountLines(path): | 35 def CountLines(path): |
31 out = subprocess.Popen(['wc', '-l', path], | 36 out = subprocess.Popen( |
32 stdout=subprocess.PIPE, | 37 ["wc", "-l", path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT |
33 stderr=subprocess.STDOUT | 38 ).communicate()[0] |
34 ).communicate()[0] | 39 return int(out.partition(b" ")[0]) |
35 return int(out.partition(b' ')[0]) | |
36 | 40 |
37 | 41 |
38 def grep(pattern, file_obj): | 42 def grep(pattern, file_obj): |
39 grepper = re.compile(pattern) | 43 grepper = re.compile(pattern) |
40 for line in file_obj: | 44 for line in file_obj: |
47 mol = False | 51 mol = False |
48 possible_inchi = True | 52 possible_inchi = True |
49 for line_counter, line in enumerate(open(filepath)): | 53 for line_counter, line in enumerate(open(filepath)): |
50 if line_counter > 10000: | 54 if line_counter > 10000: |
51 break | 55 break |
52 if line.find('$$$$') != -1: | 56 if line.find("$$$$") != -1: |
53 return 'sdf' | 57 return "sdf" |
54 elif line.find('@<TRIPOS>MOLECULE') != -1: | 58 elif line.find("@<TRIPOS>MOLECULE") != -1: |
55 return 'mol2' | 59 return "mol2" |
56 elif line.find('ligand id') != -1: | 60 elif line.find("ligand id") != -1: |
57 return 'drf' | 61 return "drf" |
58 elif possible_inchi and re.findall('^InChI=', line): | 62 elif possible_inchi and re.findall("^InChI=", line): |
59 return 'inchi' | 63 return "inchi" |
60 elif re.findall(r'^M\s+END', line): | 64 elif re.findall(r"^M\s+END", line): |
61 mol = True | 65 mol = True |
62 # first line is not an InChI, so it can't be an InChI file | 66 # first line is not an InChI, so it can't be an InChI file |
63 possible_inchi = False | 67 possible_inchi = False |
64 | 68 |
65 if mol: | 69 if mol: |
66 # END can occures before $$$$, so and SDF file will | 70 # END can occures before $$$$, so and SDF file will |
67 # be recognised as mol, if you not using this hack' | 71 # be recognised as mol, if you not using this hack' |
68 return 'mol' | 72 return "mol" |
69 return 'smi' | 73 return "smi" |
70 | 74 |
71 | 75 |
72 def db_connect(args): | 76 def db_connect(args): |
73 try: | 77 try: |
74 db_conn = psycopg2.connect("dbname=%s user=%s host=%s password=%s" % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd)) | 78 db_conn = psycopg2.connect( |
79 "dbname=%s user=%s host=%s password=%s" | |
80 % (args.dbname, args.dbuser, args.dbhost, args.dbpasswd) | |
81 ) | |
75 return db_conn | 82 return db_conn |
76 except psycopg2.Error: | 83 except psycopg2.Error: |
77 sys.exit('Unable to connect to the db') | 84 sys.exit("Unable to connect to the db") |
78 | 85 |
79 | 86 |
80 ColumnNames = { | 87 ColumnNames = { |
81 'can_smiles': 'Canonical SMILES', | 88 "can_smiles": "Canonical SMILES", |
82 'can': 'Canonical SMILES', | 89 "can": "Canonical SMILES", |
83 'inchi': 'InChI', | 90 "inchi": "InChI", |
84 'inchi_key': 'InChI key', | 91 "inchi_key": "InChI key", |
85 'inchi_key_first': 'InChI key first', | 92 "inchi_key_first": "InChI key first", |
86 'inchi_key_last': 'InChI key last', | 93 "inchi_key_last": "InChI key last", |
87 'molwt': 'Molecular weight', | 94 "molwt": "Molecular weight", |
88 'hbd': 'Hydrogen-bond donors', | 95 "hbd": "Hydrogen-bond donors", |
89 'donors': 'Hydrogen-bond donors', | 96 "donors": "Hydrogen-bond donors", |
90 'hba': 'Hydrogen-bond acceptors', | 97 "hba": "Hydrogen-bond acceptors", |
91 'acceptors': 'Hydrogen-bond acceptors', | 98 "acceptors": "Hydrogen-bond acceptors", |
92 'rotbonds': 'Rotatable bonds', | 99 "rotbonds": "Rotatable bonds", |
93 'logp': 'logP', | 100 "logp": "logP", |
94 'psa': 'Polar surface area', | 101 "psa": "Polar surface area", |
95 'mr': 'Molecular refractivity', | 102 "mr": "Molecular refractivity", |
96 'atoms': 'Number of heavy atoms', | 103 "atoms": "Number of heavy atoms", |
97 'rings': 'Number of rings', | 104 "rings": "Number of rings", |
98 'set_bits': 'FP2 bits', | 105 "set_bits": "FP2 bits", |
99 'id': 'Internal identifier', | 106 "id": "Internal identifier", |
100 'tani': 'Tanimoto coefficient', | 107 "tani": "Tanimoto coefficient", |
101 'spectrophore': 'Spectrophores(TM)', | 108 "spectrophore": "Spectrophores(TM)", |
102 'dist_spectrophore': 'Spectrophores(TM) distance to target', | 109 "dist_spectrophore": "Spectrophores(TM) distance to target", |
103 'synonym': 'Entry id', | 110 "synonym": "Entry id", |
104 } | 111 } |
105 | 112 |
106 OBDescriptor = { | 113 OBDescriptor = { |
107 'atoms': ["atoms", "Number of atoms"], | 114 "atoms": ["atoms", "Number of atoms"], |
108 'hatoms': ["hatoms", "Number of heavy atoms"], # self defined tag hatoms in plugindefines.txt | 115 "hatoms": [ |
109 'can_smiles': ["cansmi", "Canonical SMILES"], | 116 "hatoms", |
110 'can_smilesNS': ["cansmiNS", "Canonical SMILES without isotopes or stereo"], | 117 "Number of heavy atoms", |
118 ], # self defined tag hatoms in plugindefines.txt | |
119 "can_smiles": ["cansmi", "Canonical SMILES"], | |
120 "can_smilesNS": ["cansmiNS", "Canonical SMILES without isotopes or stereo"], | |
111 # ["abonds", "Number of aromatic bonds"], | 121 # ["abonds", "Number of aromatic bonds"], |
112 # ["bonds", "Number of bonds"], | 122 # ["bonds", "Number of bonds"], |
113 # ["dbonds", "Number of double bonds"], | 123 # ["dbonds", "Number of double bonds"], |
114 # ["formula", "Chemical formula"], | 124 # ["formula", "Chemical formula"], |
115 'hba': ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"], | 125 "hba": ["HBA1", "Number of Hydrogen Bond Acceptors 1 (JoelLib)"], |
116 'hba2': ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"], | 126 "hba2": ["HBA2", "Number of Hydrogen Bond Acceptors 2 (JoelLib)"], |
117 'hbd': ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"], | 127 "hbd": ["HBD", "Number of Hydrogen Bond Donors (JoelLib)"], |
118 'inchi': ["InChI", "IUPAC InChI identifier"], | 128 "inchi": ["InChI", "IUPAC InChI identifier"], |
119 'inchi_key': ["InChIKey", "InChIKey"], | 129 "inchi_key": ["InChIKey", "InChIKey"], |
120 # ["L5", "Lipinski Rule of Five"], | 130 # ["L5", "Lipinski Rule of Five"], |
121 'logp': ["logP", "octanol/water partition coefficient"], | 131 "logp": ["logP", "octanol/water partition coefficient"], |
122 'mr': ["MR", "molar refractivity"], | 132 "mr": ["MR", "molar refractivity"], |
123 'molwt': ["MW", "Molecular Weight filter"], | 133 "molwt": ["MW", "Molecular Weight filter"], |
124 # ["nF", "Number of Fluorine Atoms"], | 134 # ["nF", "Number of Fluorine Atoms"], |
125 # ["s", "SMARTS filter"], | 135 # ["s", "SMARTS filter"], |
126 # ["sbonds", "Number of single bonds"], | 136 # ["sbonds", "Number of single bonds"], |
127 # ["smarts", "SMARTS filter"], | 137 # ["smarts", "SMARTS filter"], |
128 # ["tbonds", "Number of triple bonds"], | 138 # ["tbonds", "Number of triple bonds"], |
129 # ["title", "For comparing a molecule's title"], | 139 # ["title", "For comparing a molecule's title"], |
130 'psa': ["TPSA", "topological polar surface area"], | 140 "psa": ["TPSA", "topological polar surface area"], |
131 'rotbonds': ['ROTATABLE_BOND', 'rotatable bonds'], | 141 "rotbonds": ["ROTATABLE_BOND", "rotatable bonds"], |
132 } | 142 } |
133 | 143 |
134 | 144 |
135 def print_output(args, rows): | 145 def print_output(args, rows): |
136 if args.oformat == 'table': | 146 if args.oformat == "table": |
137 outfile = open(args.output, 'w') | 147 outfile = open(args.output, "w") |
138 requested_fields = (filter(lambda x: x not in ["[", "]", "'"], args.fetch)).split(', ') | 148 requested_fields = ( |
149 filter(lambda x: x not in ["[", "]", "'"], args.fetch) | |
150 ).split(", ") | |
139 if args.header: | 151 if args.header: |
140 outfile.write('Identifier\t' + '\t'.join([ColumnNames[key] for key in requested_fields]) + '\n') | 152 outfile.write( |
153 "Identifier\t" | |
154 + "\t".join([ColumnNames[key] for key in requested_fields]) | |
155 + "\n" | |
156 ) | |
141 for row in rows: | 157 for row in rows: |
142 outfile.write(row['synonym'] + '\t' + '\t'.join([str(row[key]) for key in requested_fields]) + '\n') | 158 outfile.write( |
143 | 159 row["synonym"] |
144 elif args.oformat in ['sdf', 'mol2']: | 160 + "\t" |
161 + "\t".join([str(row[key]) for key in requested_fields]) | |
162 + "\n" | |
163 ) | |
164 | |
165 elif args.oformat in ["sdf", "mol2"]: | |
145 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) | 166 outfile = pybel.Outputfile(args.oformat, args.output, overwrite=True) |
146 for row in rows: | 167 for row in rows: |
147 try: | 168 try: |
148 mol = pybel.readstring('sdf', row['mol']) | 169 mol = pybel.readstring("sdf", row["mol"]) |
149 if args.oformat == 'sdf': | 170 if args.oformat == "sdf": |
150 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split(', ') | 171 keys = filter(lambda x: x not in ["[", "]", "'"], args.fetch).split( |
151 mol.data.update({ColumnNames['synonym']: row['synonym']}) | 172 ", " |
152 if 'inchi_key' in keys: | 173 ) |
153 keys = (', '.join(keys).replace("inchi_key", "inchi_key_first, inchi_key_last")).split(', ') | 174 mol.data.update({ColumnNames["synonym"]: row["synonym"]}) |
154 [mol.data.update({ColumnNames[key]: row[key]}) for key in keys if key] | 175 if "inchi_key" in keys: |
176 keys = ( | |
177 ", ".join(keys).replace( | |
178 "inchi_key", "inchi_key_first, inchi_key_last" | |
179 ) | |
180 ).split(", ") | |
181 [ | |
182 mol.data.update({ColumnNames[key]: row[key]}) | |
183 for key in keys | |
184 if key | |
185 ] | |
155 outfile.write(mol) | 186 outfile.write(mol) |
156 except OSError: | 187 except OSError: |
157 pass | 188 pass |
158 else: | 189 else: |
159 outfile = open(args.output, 'w') | 190 outfile = open(args.output, "w") |
160 outfile.write('\n'.join(['%s\t%s' % (row[args.oformat], row['synonym']) for row in rows])) | 191 outfile.write( |
192 "\n".join(["%s\t%s" % (row[args.oformat], row["synonym"]) for row in rows]) | |
193 ) | |
161 outfile.close() | 194 outfile.close() |
162 | 195 |
163 | 196 |
164 def pybel_stop_logging(): | 197 def pybel_stop_logging(): |
165 openbabel.obErrorLog.StopLogging() | 198 openbabel.obErrorLog.StopLogging() |
166 | 199 |
167 | 200 |
168 def get_properties_ext(mol): | 201 def get_properties_ext(mol): |
169 HBD = pybel.Smarts("[!#6;!H0]") | 202 HBD = pybel.Smarts("[!#6;!H0]") |
170 HBA = pybel.Smarts(("[$([$([#8,#16]);!$(*=N~O);" | 203 HBA = pybel.Smarts( |
171 "!$(*~N=O);X1,X2]),$([#7;v3;" | 204 ( |
172 "!$([nH]);!$(*(-a)-a)])]" | 205 "[$([$([#8,#16]);!$(*=N~O);" |
173 )) | 206 "!$(*~N=O);X1,X2]),$([#7;v3;" |
207 "!$([nH]);!$(*(-a)-a)])]" | |
208 ) | |
209 ) | |
174 calc_desc_dict = mol.calcdesc() | 210 calc_desc_dict = mol.calcdesc() |
175 | 211 |
176 try: | 212 try: |
177 logp = calc_desc_dict['logP'] | 213 logp = calc_desc_dict["logP"] |
178 except KeyError: | 214 except KeyError: |
179 logp = calc_desc_dict['LogP'] | 215 logp = calc_desc_dict["LogP"] |
180 | 216 |
181 return {"molwt": mol.molwt, | 217 return { |
182 "logp": logp, | 218 "molwt": mol.molwt, |
183 "donors": len(HBD.findall(mol)), | 219 "logp": logp, |
184 "acceptors": len(HBA.findall(mol)), | 220 "donors": len(HBD.findall(mol)), |
185 "psa": calc_desc_dict['TPSA'], | 221 "acceptors": len(HBA.findall(mol)), |
186 "mr": calc_desc_dict['MR'], | 222 "psa": calc_desc_dict["TPSA"], |
187 "rotbonds": mol.OBMol.NumRotors(), | 223 "mr": calc_desc_dict["MR"], |
188 "can": mol.write("can").split()[0].strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) | 224 "rotbonds": mol.OBMol.NumRotors(), |
189 "inchi": mol.write("inchi").strip(), | 225 "can": mol.write("can") |
190 "inchi_key": get_inchikey(mol).strip(), | 226 .split()[0] |
191 "rings": len(mol.sssr), | 227 .strip(), # tthis one works fine for both zinc and chembl (no ZINC code added after can descriptor string) |
192 "atoms": mol.OBMol.NumHvyAtoms(), | 228 "inchi": mol.write("inchi").strip(), |
193 "spectrophore": OBspectrophore(mol), | 229 "inchi_key": get_inchikey(mol).strip(), |
194 } | 230 "rings": len(mol.sssr), |
231 "atoms": mol.OBMol.NumHvyAtoms(), | |
232 "spectrophore": OBspectrophore(mol), | |
233 } | |
195 | 234 |
196 | 235 |
197 def get_inchikey(mol): | 236 def get_inchikey(mol): |
198 conv = openbabel.OBConversion() | 237 conv = openbabel.OBConversion() |
199 conv.SetInAndOutFormats("mol", "inchi") | 238 conv.SetInAndOutFormats("mol", "inchi") |
204 | 243 |
205 def OBspectrophore(mol): | 244 def OBspectrophore(mol): |
206 spectrophore = pybel.ob.OBSpectrophore() | 245 spectrophore = pybel.ob.OBSpectrophore() |
207 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. | 246 # Parameters: rotation angle = 20, normalization for mean and sd, accuracy = 3.0 A and non-stereospecific cages. |
208 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd) | 247 spectrophore.SetNormalization(spectrophore.NormalizationTowardsZeroMeanAndUnitStd) |
209 return ', '.join(["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)]) | 248 return ", ".join( |
210 | 249 ["%.3f" % value for value in spectrophore.GetSpectrophore(mol.OBMol)] |
211 | 250 ) |
212 def split_library(lib_path, lib_format='sdf', package_size=None): | 251 |
252 | |
253 def split_library(lib_path, lib_format="sdf", package_size=None): | |
213 """ | 254 """ |
214 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size) | 255 Split a library of compounds. Usage: split_library(lib_path, lib_format, package_size) |
215 IT currently ONLY WORKS FOR SD-Files | 256 IT currently ONLY WORKS FOR SD-Files |
216 """ | 257 """ |
217 pack = 1 | 258 pack = 1 |
218 mol_counter = 0 | 259 mol_counter = 0 |
219 | 260 |
220 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') | 261 outfile = open( |
221 | 262 "/%s/%s_pack_%i.%s" |
222 for line in open(lib_path, 'r'): | 263 % ( |
264 "/".join(lib_path.split("/")[:-1]), | |
265 lib_path.split("/")[-1].split(".")[0], | |
266 pack, | |
267 "sdf", | |
268 ), | |
269 "w", | |
270 ) | |
271 | |
272 for line in open(lib_path, "r"): | |
223 outfile.write(line) | 273 outfile.write(line) |
224 if line.strip() == '$$$$': | 274 if line.strip() == "$$$$": |
225 mol_counter += 1 | 275 mol_counter += 1 |
226 if mol_counter % package_size == 0: | 276 if mol_counter % package_size == 0: |
227 outfile.close() | 277 outfile.close() |
228 pack += 1 | 278 pack += 1 |
229 outfile = open('/%s/%s_pack_%i.%s' % ('/'.join(lib_path.split('/')[:-1]), lib_path.split('/')[-1].split('.')[0], pack, 'sdf'), 'w') | 279 outfile = open( |
280 "/%s/%s_pack_%i.%s" | |
281 % ( | |
282 "/".join(lib_path.split("/")[:-1]), | |
283 lib_path.split("/")[-1].split(".")[0], | |
284 pack, | |
285 "sdf", | |
286 ), | |
287 "w", | |
288 ) | |
230 if mol_counter * 10 % package_size == 0: | 289 if mol_counter * 10 % package_size == 0: |
231 print('%i molecules parsed, starting pack nr. %i' % (mol_counter, pack - 1)) | 290 print( |
291 "%i molecules parsed, starting pack nr. %i" | |
292 % (mol_counter, pack - 1) | |
293 ) | |
232 outfile.close() | 294 outfile.close() |
233 | 295 |
234 return True | 296 return True |
235 | 297 |
236 | 298 |
240 Usage: split_smi_library(smiles_file, 10) | 302 Usage: split_smi_library(smiles_file, 10) |
241 """ | 303 """ |
242 output_files = [] | 304 output_files = [] |
243 tfile = tempfile.NamedTemporaryFile(delete=False) | 305 tfile = tempfile.NamedTemporaryFile(delete=False) |
244 | 306 |
245 smiles_handle = open(smiles_file, 'r') | 307 smiles_handle = open(smiles_file, "r") |
246 for count, line in enumerate(smiles_handle): | 308 for count, line in enumerate(smiles_handle): |
247 if count % structures_in_one_file == 0 and count != 0: | 309 if count % structures_in_one_file == 0 and count != 0: |
248 tfile.close() | 310 tfile.close() |
249 output_files.append(tfile.name) | 311 output_files.append(tfile.name) |
250 tfile = tempfile.NamedTemporaryFile(delete=False) | 312 tfile = tempfile.NamedTemporaryFile(delete=False) |
255 return output_files | 317 return output_files |
256 | 318 |
257 | 319 |
258 def mp_run(input_path, regex, PROCESSES, function_to_call): | 320 def mp_run(input_path, regex, PROCESSES, function_to_call): |
259 paths = [] | 321 paths = [] |
260 [paths.append(compound_file) for compound_file in glob.glob(str(input_path) + str(regex))] | 322 [ |
323 paths.append(compound_file) | |
324 for compound_file in glob.glob(str(input_path) + str(regex)) | |
325 ] | |
261 paths.sort() | 326 paths.sort() |
262 | 327 |
263 pool = Pool(processes=PROCESSES) | 328 pool = Pool(processes=PROCESSES) |
264 print('Process initialized with', PROCESSES, 'processors') | 329 print("Process initialized with", PROCESSES, "processors") |
265 result = pool.map_async(function_to_call, paths) | 330 result = pool.map_async(function_to_call, paths) |
266 result.get() | 331 result.get() |
267 | 332 |
268 return paths | 333 return paths |
269 | 334 |
270 | 335 |
271 if __name__ == '__main__': | 336 if __name__ == "__main__": |
272 print(check_filetype(sys.argv[1])) | 337 print(check_filetype(sys.argv[1])) |