comparison rdkit_descriptors.py @ 9:0993ac4f4a23 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
author bgruening
date Sat, 04 Dec 2021 16:40:00 +0000
parents a1c53f0533b0
children
comparison
equal deleted inserted replaced
8:a1c53f0533b0 9:0993ac4f4a23
6 6
7 from rdkit import Chem 7 from rdkit import Chem
8 from rdkit.Chem import Descriptors 8 from rdkit.Chem import Descriptors
9 9
10 10
11 def get_supplier(infile, format='smiles'): 11 def get_supplier(infile, format="smiles"):
12 """ 12 """
13 Returns a generator over a SMILES or InChI file. Every element is of RDKit 13 Returns a generator over a SMILES or InChI file. Every element is of RDKit
14 molecule and has its original string as _Name property. 14 molecule and has its original string as _Name property.
15 """ 15 """
16 with open(infile) as handle: 16 with open(infile) as handle:
17 for line in handle: 17 for line in handle:
18 line = line.strip() 18 line = line.strip()
19 if format == 'smiles': 19 if format == "smiles":
20 mol = Chem.MolFromSmiles(line, sanitize=True) 20 mol = Chem.MolFromSmiles(line, sanitize=True)
21 elif format == 'inchi': 21 elif format == "inchi":
22 mol = Chem.inchi.MolFromInchi(line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False) 22 mol = Chem.inchi.MolFromInchi(
23 line,
24 sanitize=True,
25 removeHs=True,
26 logLevel=None,
27 treatWarningAsError=False,
28 )
23 if mol is None: 29 if mol is None:
24 yield False 30 yield False
25 else: 31 else:
26 mol.SetProp('_Name', line.split('\t')[0]) 32 mol.SetProp("_Name", line.split("\t")[0])
27 yield mol 33 yield mol
28 34
29 35
30 def get_rdkit_descriptor_functions(): 36 def get_rdkit_descriptor_functions():
31 """ 37 """
32 Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function) 38 Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function)
33 """ 39 """
34 ret = [(name, f) for name, f in inspect.getmembers(Descriptors) if inspect.isfunction(f) and not name.startswith('_')] 40 ret = [
41 (name, f)
42 for name, f in inspect.getmembers(Descriptors)
43 if inspect.isfunction(f) and not name.startswith("_")
44 ]
35 # some which are not in the official Descriptors module we need to add manually 45 # some which are not in the official Descriptors module we need to add manually
36 ret.extend([('FormalCharge', Chem.GetFormalCharge), ('SSSR', Chem.GetSSSR)]) 46 ret.extend([("FormalCharge", Chem.GetFormalCharge), ("SSSR", Chem.GetSSSR)])
37 ret.sort() 47 ret.sort()
38 return ret 48 return ret
39 49
40 50
41 def descriptors(mol, functions): 51 def descriptors(mol, functions):
46 yield (name, function(mol)) 56 yield (name, function(mol))
47 57
48 58
49 if __name__ == "__main__": 59 if __name__ == "__main__":
50 parser = argparse.ArgumentParser() 60 parser = argparse.ArgumentParser()
51 parser.add_argument('-i', '--infile', required=True, help='Path to the input file.') 61 parser.add_argument("-i", "--infile", required=True, help="Path to the input file.")
52 parser.add_argument("--iformat", help="Specify the input file format.") 62 parser.add_argument("--iformat", help="Specify the input file format.")
53 63
54 parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), 64 parser.add_argument(
55 default=sys.stdout, 65 "-o",
56 help="path to the result file, default is stdout") 66 "--outfile",
67 type=argparse.FileType("w+"),
68 default=sys.stdout,
69 help="path to the result file, default is stdout",
70 )
57 71
58 parser.add_argument('-s', '--select', default=None, 72 parser.add_argument(
59 help="select a subset of comma-separated descriptors to use") 73 "-s",
74 "--select",
75 default=None,
76 help="select a subset of comma-separated descriptors to use",
77 )
60 78
61 parser.add_argument("--header", dest="header", action="store_true", 79 parser.add_argument(
62 default=False, 80 "--header",
63 help="Write header line.") 81 dest="header",
82 action="store_true",
83 default=False,
84 help="Write header line.",
85 )
64 86
65 args = parser.parse_args() 87 args = parser.parse_args()
66 88
67 if args.iformat == 'sdf': 89 if args.iformat == "sdf":
68 supplier = Chem.SDMolSupplier(args.infile) 90 supplier = Chem.SDMolSupplier(args.infile)
69 elif args.iformat == 'smi': 91 elif args.iformat == "smi":
70 supplier = get_supplier(args.infile, format='smiles') 92 supplier = get_supplier(args.infile, format="smiles")
71 elif args.iformat == 'inchi': 93 elif args.iformat == "inchi":
72 supplier = get_supplier(args.infile, format='inchi') 94 supplier = get_supplier(args.infile, format="inchi")
73 elif args.iformat == 'pdb': 95 elif args.iformat == "pdb":
74 supplier = [Chem.MolFromPDBFile(args.infile)] 96 supplier = [Chem.MolFromPDBFile(args.infile)]
75 elif args.iformat == 'mol2': 97 elif args.iformat == "mol2":
76 supplier = [Chem.MolFromMol2File(args.infile)] 98 supplier = [Chem.MolFromMol2File(args.infile)]
77 99
78 functions = get_rdkit_descriptor_functions() 100 functions = get_rdkit_descriptor_functions()
79 if args.select and args.select != 'None': 101 if args.select and args.select != "None":
80 selected = args.select.split(',') 102 selected = args.select.split(",")
81 functions = [(name, f) for name, f in functions if name in selected] 103 functions = [(name, f) for name, f in functions if name in selected]
82 104
83 if args.header: 105 if args.header:
84 args.outfile.write('%s\n' % '\t'.join(['MoleculeID'] + [name for name, f in functions])) 106 args.outfile.write(
107 "%s\n" % "\t".join(["MoleculeID"] + [name for name, f in functions])
108 )
85 109
86 for mol in supplier: 110 for mol in supplier:
87 if not mol: 111 if not mol:
88 continue 112 continue
89 descs = descriptors(mol, functions) 113 descs = descriptors(mol, functions)
90 try: 114 try:
91 molecule_id = mol.GetProp("_Name") 115 molecule_id = mol.GetProp("_Name")
92 except KeyError: 116 except KeyError:
93 molecule_id = Chem.MolToSmiles(mol) 117 molecule_id = Chem.MolToSmiles(mol)
94 args.outfile.write("%s\n" % '\t'.join([molecule_id] + [str(round(res, 6)) for name, res in descs])) 118 args.outfile.write(
119 "%s\n"
120 % "\t".join([molecule_id] + [str(round(res, 6)) for name, res in descs])
121 )