comparison qed.py @ 0:5ccd3a432785 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/silicos-it/qed commit 4379e712f76f2bb12ee2cc270dd8a0e806df2cd6
author bgruening
date Tue, 23 May 2017 03:57:14 -0400
parents
children ab73abead7fa
comparison
equal deleted inserted replaced
-1:000000000000 0:5ccd3a432785
1 #!/usr/bin/env python
2 __all__ = ['weights_max', 'weights_mean', 'weights_none', 'default']
3
4 # RDKit
5 from rdkit.Chem import Descriptors
6 from rdkit import Chem
7
8 # General
9 from copy import deepcopy
10 from math import exp, log
11 import sys, os, re
12 import argparse
13
14
15 class SilicosItError(Exception):
16 """Base class for exceptions in Silicos-it code"""
17 pass
18
19 class WrongArgument(SilicosItError):
20 """
21 Exception raised when argument to function is not of correct type.
22
23 Attributes:
24 function -- function in which error occurred
25 msg -- explanation of the error
26 """
27 def __init__(self, function, msg):
28 self.function = function
29 self.msg = msg
30
31 def check_filetype(filepath):
32 mol = False
33 possible_inchi = True
34 for line_counter, line in enumerate(open(filepath)):
35 if line_counter > 10000:
36 break
37 if line.find('$$$$') != -1:
38 return 'sdf'
39 elif line.find('@<TRIPOS>MOLECULE') != -1:
40 return 'mol2'
41 elif line.find('ligand id') != -1:
42 return 'drf'
43 elif possible_inchi and re.findall('^InChI=', line):
44 return 'inchi'
45 elif re.findall('^M\s+END', line):
46 mol = True
47 # first line is not an InChI, so it can't be an InChI file
48 possible_inchi = False
49
50 if mol:
51 # END can occures before $$$$, so and SDF file will
52 # be recognised as mol, if you not using this hack'
53 return 'mol'
54 return 'smi'
55
56 AliphaticRings = Chem.MolFromSmarts('[$([A;R][!a])]')
57
58 AcceptorSmarts = [
59 '[oH0;X2]',
60 '[OH1;X2;v2]',
61 '[OH0;X2;v2]',
62 '[OH0;X1;v2]',
63 '[O-;X1]',
64 '[SH0;X2;v2]',
65 '[SH0;X1;v2]',
66 '[S-;X1]',
67 '[nH0;X2]',
68 '[NH0;X1;v3]',
69 '[$([N;+0;X3;v3]);!$(N[C,S]=O)]'
70 ]
71 Acceptors = []
72 for hba in AcceptorSmarts:
73 Acceptors.append(Chem.MolFromSmarts(hba))
74
75 StructuralAlertSmarts = [
76 '*1[O,S,N]*1',
77 '[S,C](=[O,S])[F,Br,Cl,I]',
78 '[CX4][Cl,Br,I]',
79 '[C,c]S(=O)(=O)O[C,c]',
80 '[$([CH]),$(CC)]#CC(=O)[C,c]',
81 '[$([CH]),$(CC)]#CC(=O)O[C,c]',
82 'n[OH]',
83 '[$([CH]),$(CC)]#CS(=O)(=O)[C,c]',
84 'C=C(C=O)C=O',
85 'n1c([F,Cl,Br,I])cccc1',
86 '[CH1](=O)',
87 '[O,o][O,o]',
88 '[C;!R]=[N;!R]',
89 '[N!R]=[N!R]',
90 '[#6](=O)[#6](=O)',
91 '[S,s][S,s]',
92 '[N,n][NH2]',
93 'C(=O)N[NH2]',
94 '[C,c]=S',
95 '[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]=[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]',
96 'C1(=[O,N])C=CC(=[O,N])C=C1',
97 'C1(=[O,N])C(=[O,N])C=CC=C1',
98 'a21aa3a(aa1aaaa2)aaaa3',
99 'a31a(a2a(aa1)aaaa2)aaaa3',
100 'a1aa2a3a(a1)A=AA=A3=AA=A2',
101 'c1cc([NH2])ccc1',
102 '[Hg,Fe,As,Sb,Zn,Se,se,Te,B,Si,Na,Ca,Ge,Ag,Mg,K,Ba,Sr,Be,Ti,Mo,Mn,Ru,Pd,Ni,Cu,Au,Cd,Al,Ga,Sn,Rh,Tl,Bi,Nb,Li,Pb,Hf,Ho]',
103 'I',
104 'OS(=O)(=O)[O-]',
105 '[N+](=O)[O-]',
106 'C(=O)N[OH]',
107 'C1NC(=O)NC(=O)1',
108 '[SH]',
109 '[S-]',
110 'c1ccc([Cl,Br,I,F])c([Cl,Br,I,F])c1[Cl,Br,I,F]',
111 'c1cc([Cl,Br,I,F])cc([Cl,Br,I,F])c1[Cl,Br,I,F]',
112 '[CR1]1[CR1][CR1][CR1][CR1][CR1][CR1]1',
113 '[CR1]1[CR1][CR1]cc[CR1][CR1]1',
114 '[CR2]1[CR2][CR2][CR2][CR2][CR2][CR2][CR2]1',
115 '[CR2]1[CR2][CR2]cc[CR2][CR2][CR2]1',
116 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1',
117 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1',
118 'C#C',
119 '[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]',
120 '[$([N+R]),$([n+R]),$([N+]=C)][O-]',
121 '[C,c]=N[OH]',
122 '[C,c]=NOC=O',
123 '[C,c](=O)[CX4,CR0X3,O][C,c](=O)',
124 'c1ccc2c(c1)ccc(=O)o2',
125 '[O+,o+,S+,s+]',
126 'N=C=O',
127 '[NX3,NX4][F,Cl,Br,I]',
128 'c1ccccc1OC(=O)[#6]',
129 '[CR0]=[CR0][CR0]=[CR0]',
130 '[C+,c+,C-,c-]',
131 'N=[N+]=[N-]',
132 'C12C(NC(N1)=O)CSC2',
133 'c1c([OH])c([OH,NH2,NH])ccc1',
134 'P',
135 '[N,O,S]C#N',
136 'C=C=O',
137 '[Si][F,Cl,Br,I]',
138 '[SX2]O',
139 '[SiR0,CR0](c1ccccc1)(c2ccccc2)(c3ccccc3)',
140 'O1CCCCC1OC2CCC3CCCCC3C2',
141 'N=[CR0][N,n,O,S]',
142 '[cR2]1[cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2][cR2]1[cR2]2[cR2][cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2]2',
143 'C=[C!r]C#N',
144 '[cR2]1[cR2]c([N+0X3R0,nX3R0])c([N+0X3R0,nX3R0])[cR2][cR2]1',
145 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2]c([N+0X3R0,nX3R0])[cR2]1',
146 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2][cR2]c1([N+0X3R0,nX3R0])',
147 '[OH]c1ccc([OH,NH2,NH])cc1',
148 'c1ccccc1OC(=O)O',
149 '[SX2H0][N]',
150 'c12ccccc1(SC(S)=N2)',
151 'c12ccccc1(SC(=S)N2)',
152 'c1nnnn1C=O',
153 's1c(S)nnc1NC=O',
154 'S1C=CSC1=S',
155 'C(=O)Onnn',
156 'OS(=O)(=O)C(F)(F)F',
157 'N#CC[OH]',
158 'N#CC(=O)',
159 'S(=O)(=O)C#N',
160 'N[CH2]C#N',
161 'C1(=O)NCC1',
162 'S(=O)(=O)[O-,OH]',
163 'NC[F,Cl,Br,I]',
164 'C=[C!r]O',
165 '[NX2+0]=[O+0]',
166 '[OR0,NR0][OR0,NR0]',
167 'C(=O)O[C,H1].C(=O)O[C,H1].C(=O)O[C,H1]',
168 '[CX2R0][NX3R0]',
169 'c1ccccc1[C;!R]=[C;!R]c2ccccc2',
170 '[NX3R0,NX4R0,OR0,SX2R0][CX4][NX3R0,NX4R0,OR0,SX2R0]',
171 '[s,S,c,C,n,N,o,O]~[n+,N+](~[s,S,c,C,n,N,o,O])(~[s,S,c,C,n,N,o,O])~[s,S,c,C,n,N,o,O]',
172 '[s,S,c,C,n,N,o,O]~[nX3+,NX3+](~[s,S,c,C,n,N])~[s,S,c,C,n,N]',
173 '[*]=[N+]=[*]',
174 '[SX3](=O)[O-,OH]',
175 'N#N',
176 'F.F.F.F',
177 '[R0;D2][R0;D2][R0;D2][R0;D2]',
178 '[cR,CR]~C(=O)NC(=O)~[cR,CR]',
179 'C=!@CC=[O,S]',
180 '[#6,#8,#16][C,c](=O)O[C,c]',
181 'c[C;R0](=[O,S])[C,c]',
182 'c[SX2][C;!R]',
183 'C=C=C',
184 'c1nc([F,Cl,Br,I,S])ncc1',
185 'c1ncnc([F,Cl,Br,I,S])c1',
186 'c1nc(c2c(n1)nc(n2)[F,Cl,Br,I])',
187 '[C,c]S(=O)(=O)c1ccc(cc1)F',
188 '[15N]',
189 '[13C]',
190 '[18O]',
191 '[34S]'
192 ]
193
194 StructuralAlerts = []
195 for smarts in StructuralAlertSmarts:
196 StructuralAlerts.append(Chem.MolFromSmarts(smarts))
197
198
199 # ADS parameters for the 8 molecular properties: [row][column]
200 # rows[8]: MW, ALOGP, HBA, HBD, PSA, ROTB, AROM, ALERTS
201 # columns[7]: A, B, C, D, E, F, DMAX
202 # ALOGP parameters from Gregory Gerebtzoff (2012, Roche)
203 pads1 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561],
204 [0.486849448, 186.2293718, 2.066177165, 3.902720615, 1.027025453, 0.913012565, 145.4314800],
205 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046],
206 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616],
207 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167],
208 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403],
209 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610],
210 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ]
211 # ALOGP parameters from the original publication
212 pads2 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561],
213 [3.172690585, 137.8624751, 2.534937431, 4.581497897, 0.822739154, 0.576295591, 131.3186604],
214 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046],
215 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616],
216 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167],
217 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403],
218 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610],
219 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ]
220
221 def ads(x, a, b, c, d, e, f, dmax):
222 return ((a+(b/(1+exp(-1*(x-c+d/2)/e))*(1-1/(1+exp(-1*(x-c-d/2)/f))))) / dmax)
223
224 def properties(mol):
225 """
226 Calculates the properties that are required to calculate the QED descriptor.
227 """
228 matches = []
229 if mol is None:
230 raise WrongArgument("properties(mol)", "mol argument is \'None\'")
231 x = [0] * 9
232 x[0] = Descriptors.MolWt(mol) # MW
233 x[1] = Descriptors.MolLogP(mol) # ALOGP
234 for hba in Acceptors: # HBA
235 if mol.HasSubstructMatch(hba):
236 matches = mol.GetSubstructMatches(hba)
237 x[2] += len(matches)
238 x[3] = Descriptors.NumHDonors(mol) # HBD
239 x[4] = Descriptors.TPSA(mol) # PSA
240 x[5] = Descriptors.NumRotatableBonds(mol) # ROTB
241 x[6] = Chem.GetSSSR(Chem.DeleteSubstructs(deepcopy(mol), AliphaticRings)) # AROM
242 for alert in StructuralAlerts: # ALERTS
243 if (mol.HasSubstructMatch(alert)): x[7] += 1
244 ro5_failed = 0
245 if x[3] > 5:
246 ro5_failed += 1 #HBD
247 if x[2] > 10:
248 ro5_failed += 1 #HBA
249 if x[0] >= 500:
250 ro5_failed += 1
251 if x[1] > 5:
252 ro5_failed += 1
253 x[8] = ro5_failed
254 return x
255
256
257 def qed(w, p, gerebtzoff):
258 d = [0.00] * 8
259 if gerebtzoff:
260 for i in range(0, 8):
261 d[i] = ads(p[i], pads1[i][0], pads1[i][1], pads1[i][2], pads1[i][3], pads1[i][4], pads1[i][5], pads1[i][6])
262 else:
263 for i in range(0, 8):
264 d[i] = ads(p[i], pads2[i][0], pads2[i][1], pads2[i][2], pads2[i][3], pads2[i][4], pads2[i][5], pads2[i][6])
265 t = 0.0
266 for i in range(0, 8):
267 t += w[i] * log(d[i])
268 return (exp(t / sum(w)))
269
270
271 def weights_max(mol, gerebtzoff = True, props = False):
272 """
273 Calculates the QED descriptor using maximal descriptor weights.
274 If props is specified we skip the calculation step and use the props-list of properties.
275 """
276 if not props:
277 props = properties(mol)
278 return qed([0.50, 0.25, 0.00, 0.50, 0.00, 0.50, 0.25, 1.00], props, gerebtzoff)
279
280
281 def weights_mean(mol, gerebtzoff = True, props = False):
282 """
283 Calculates the QED descriptor using average descriptor weights.
284 If props is specified we skip the calculation step and use the props-list of properties.
285 """
286 if not props:
287 props = properties(mol)
288 return qed([0.66, 0.46, 0.05, 0.61, 0.06, 0.65, 0.48, 0.95], props, gerebtzoff)
289
290
291 def weights_none(mol, gerebtzoff = True, props = False):
292 """
293 Calculates the QED descriptor using unit weights.
294 If props is specified we skip the calculation step and use the props-list of properties.
295 """
296 if not props:
297 props = properties(mol)
298 return qed([1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00], props, gerebtzoff)
299
300
301 def default(mol, gerebtzoff = True):
302 """
303 Calculates the QED descriptor using average descriptor weights and Gregory Gerebtzoff parameters.
304 """
305 return weights_mean(mol, gerebtzoff)
306
307
308 if __name__ == "__main__":
309 parser = argparse.ArgumentParser()
310 parser.add_argument('-i', '--input',
311 required=True,
312 help='path to the input file name')
313 parser.add_argument("-m", "--method",
314 dest="method",
315 choices=['max', 'mean', 'unweighted'],
316 default="mean",
317 help="Specify the method you want to use.")
318 parser.add_argument("--iformat",
319 help="Input format. It must be supported by openbabel.")
320 parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
321 default=sys.stdout,
322 help="path to the result file, default it sdtout")
323 parser.add_argument("--header", dest="header", action="store_true",
324 default=False,
325 help="Write header line.")
326
327
328 args = parser.parse_args()
329
330 # Elucidate filetype and open supplier
331 ifile = os.path.abspath(args.input)
332 if not os.path.isfile(ifile):
333 print "Error: ", ifile, " is not a file or cannot be found."
334 sys.exit(1)
335 if not os.path.exists(ifile):
336 print "Error: ", ifile, " does not exist or cannot be found."
337 sys.exit(1)
338 if not os.access(ifile, os.R_OK):
339 print "Error: ", ifile, " is not readable."
340 sys.exit(1)
341
342 if not args.iformat:
343 # try to guess the filetype
344 filetype = check_filetype( ifile )
345 else:
346 filetype = args.iformat # sdf or smi
347
348
349 """
350 We want to store the original SMILES in the output. So in case of a SMILES file iterate over the file and convert each line separate.
351 """
352 if filetype == 'sdf':
353 supplier = Chem.SDMolSupplier( ifile )
354 # Process file
355 if args.header:
356 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\n")
357 count = 0
358 for mol in supplier:
359 count += 1
360 if mol is None:
361 print "Warning: skipping molecule ", count, " and continuing with next."
362 continue
363 props = properties(mol)
364
365 if args.method == 'max':
366 calc_qed = weights_max(mol, True, props)
367 elif args.method == 'unweighted':
368 calc_qed = weights_none(mol, True, props)
369 else:
370 calc_qed = weights_mean(mol, True, props)
371
372 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\n" % (
373 props[0],
374 props[1],
375 props[2],
376 props[3],
377 props[4],
378 props[5],
379 props[6],
380 props[7],
381 props[8],
382 calc_qed,
383 mol.GetProp("_Name"),
384 ))
385 elif filetype == 'smi':
386 supplier = Chem.SmilesMolSupplier( ifile, " \t", 0, 1, False, True )
387
388 # Process file
389 if args.header:
390 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\tSMILES\n")
391 count = 0
392 for line in open(ifile):
393 tokens = line.strip().split('\t')
394 if len(tokens) > 1:
395 smiles, title = tokens
396 else:
397 smiles = tokens[0]
398 title = ''
399 mol = Chem.MolFromSmiles(smiles)
400 count += 1
401 if mol is None:
402 print "Warning: skipping molecule ", count, " and continuing with next."
403 continue
404 props = properties(mol)
405
406 if args.method == 'max':
407 calc_qed = weights_max(mol, True, props)
408 elif args.method == 'unweighted':
409 calc_qed = weights_none(mol, True, props)
410 else:
411 calc_qed = weights_mean(mol, True, props)
412
413 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\t%s\n" % (
414 props[0],
415 props[1],
416 props[2],
417 props[3],
418 props[4],
419 props[5],
420 props[6],
421 props[7],
422 props[8],
423 calc_qed,
424 title,
425 smiles
426 ))
427 else:
428 sys.exit("Error: unknown file-type: %s" % filetype)