changeset 5:351fbd750a6d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 4d0bfcf37bfbedafc7ff0672dfe452766ca8a606"
author bgruening
date Wed, 17 Feb 2021 13:00:12 +0000
parents 55553120df69
children 4beb3e026bbb
files dimorphite_dl.py rdkit_descriptors.py sdf_to_tab.py test-data/mol.pdb test-data/mol_pdb_charges.tab test-data/rdkit_descriptors_result1.tab test-data/rdkit_descriptors_subset.tab
diffstat 7 files changed, 124 insertions(+), 28 deletions(-) [+]
line wrap: on
line diff
--- a/dimorphite_dl.py	Tue Jul 28 08:43:00 2020 -0400
+++ b/dimorphite_dl.py	Wed Feb 17 13:00:12 2021 +0000
@@ -1,3 +1,4 @@
+# flake8: noqa
 # Copyright 2018 Jacob D. Durrant
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,7 +19,6 @@
 """
 
 from __future__ import print_function
-import copy
 import os
 import argparse
 import sys
--- a/rdkit_descriptors.py	Tue Jul 28 08:43:00 2020 -0400
+++ b/rdkit_descriptors.py	Wed Feb 17 13:00:12 2021 +0000
@@ -1,44 +1,49 @@
 #!/usr/bin/env python
 
-from rdkit.Chem import Descriptors
-from rdkit import Chem
-import sys, os, re
 import argparse
 import inspect
+import sys
 
-def get_supplier( infile, format = 'smiles' ):
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+
+
+def get_supplier(infile, format='smiles'):
     """
-    Returns a generator over a SMILES or InChI file. Every element is of RDKit 
+    Returns a generator over a SMILES or InChI file. Every element is an RDKit
     molecule and has its original string as _Name property.
     """
     with open(infile) as handle:
         for line in handle:
             line = line.strip()
             if format == 'smiles':
-                mol = Chem.MolFromSmiles( line, sanitize=True )
+                mol = Chem.MolFromSmiles(line, sanitize=True)
             elif format == 'inchi':
-                mol = Chem.inchi.MolFromInchi( line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False )
+                mol = Chem.inchi.MolFromInchi(line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False)
             if mol is None:
                 yield False
             else:
-                mol.SetProp( '_Name', line.split('\t')[0] )
+                mol.SetProp('_Name', line.split('\t')[0])
                 yield mol
 
+
 def get_rdkit_descriptor_functions():
     """
     Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function)
     """
-    ret = [ (name, f) for name, f in inspect.getmembers( Descriptors ) if inspect.isfunction( f ) and not name.startswith( '_' ) ]
+    ret = [(name, f) for name, f in inspect.getmembers(Descriptors) if inspect.isfunction(f) and not name.startswith('_')]
+    # some descriptors are not part of the official Descriptors module, so we add them manually
+    ret.extend([('FormalCharge', Chem.GetFormalCharge), ('SSSR', Chem.GetSSSR)])
     ret.sort()
     return ret
 
 
-def descriptors( mol, functions ):
+def descriptors(mol, functions):
     """
     Calculates the descriptors of a given molecule.
     """
     for name, function in functions:
-        yield (name, function( mol ))
+        yield (name, function(mol))
 
 
 if __name__ == "__main__":
@@ -46,31 +51,44 @@
     parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
     parser.add_argument("--iformat", help="Specify the input file format.")
 
-    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), 
-        default=sys.stdout, help="path to the result file, default it sdtout")
+    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
+                        default=sys.stdout,
+                        help="path to the result file, default is stdout")
+
+    parser.add_argument('-s', '--select', default=None,
+                        help="select a subset of comma-separated descriptors to use")
 
     parser.add_argument("--header", dest="header", action="store_true",
-                    default=False,
-                    help="Write header line.")
+                        default=False,
+                        help="Write header line.")
 
     args = parser.parse_args()
 
     if args.iformat == 'sdf':
-        supplier = Chem.SDMolSupplier( args.infile )
-    elif args.iformat =='smi':
-        supplier = get_supplier( args.infile, format = 'smiles' )
+        supplier = Chem.SDMolSupplier(args.infile)
+    elif args.iformat == 'smi':
+        supplier = get_supplier(args.infile, format='smiles')
     elif args.iformat == 'inchi':
-        supplier = get_supplier( args.infile, format = 'inchi' )
+        supplier = get_supplier(args.infile, format='inchi')
+    elif args.iformat == 'pdb':
+        supplier = [Chem.MolFromPDBFile(args.infile)]
+    elif args.iformat == 'mol2':
+        supplier = [Chem.MolFromMol2File(args.infile)]
 
     functions = get_rdkit_descriptor_functions()
+    if args.select and args.select != 'None':
+        selected = args.select.split(',')
+        functions = [(name, f) for name, f in functions if name in selected]
 
     if args.header:
-        args.outfile.write( '%s\n' % '\t'.join( ['MoleculeID'] + [name for name, f in functions] ) )
+        args.outfile.write('%s\n' % '\t'.join(['MoleculeID'] + [name for name, f in functions]))
 
     for mol in supplier:
         if not mol:
             continue
-        descs = descriptors( mol, functions )
-        molecule_id = mol.GetProp("_Name")
-        args.outfile.write( "%s\n" % '\t'.join( [molecule_id]+ [str(round(res, 6)) for name, res in descs] ) )
-
+        descs = descriptors(mol, functions)
+        try:
+            molecule_id = mol.GetProp("_Name")
+        except KeyError:
+            molecule_id = Chem.MolToSmiles(mol)
+        args.outfile.write("%s\n" % '\t'.join([molecule_id] + [str(round(res, 6)) for name, res in descs]))
--- a/sdf_to_tab.py	Tue Jul 28 08:43:00 2020 -0400
+++ b/sdf_to_tab.py	Wed Feb 17 13:00:12 2021 +0000
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 import argparse
+
 import pandas as pd
 from rdkit import Chem
 
+
 def sdf_to_tab(vars):
     mols = Chem.SDMolSupplier(vars.inp, sanitize=False)
     df = pd.DataFrame()  # for output
@@ -29,6 +31,7 @@
     sorted_cols = sorted(df.columns.values.tolist())
     df.to_csv(vars.out, sep='\t', header=vars.header, columns=sorted_cols)
 
+
 def main():
     parser = argparse.ArgumentParser(description="Convert SDF to tabular")
     parser.add_argument('--inp', '-i', help="The input file", required=True)
@@ -41,7 +44,7 @@
     parser.add_argument('--name', '-n', action='store_true',
                         help="Include molecule name in output.")
     sdf_to_tab(parser.parse_args())
-    
+
 
 if __name__ == "__main__":
     main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mol.pdb	Wed Feb 17 13:00:12 2021 +0000
@@ -0,0 +1,72 @@
+COMPND    CNCC(O)CCCc1ccccc1 
+AUTHOR    GENERATED BY OPEN BABEL 3.1.0
+HETATM    1  C   UNL     1       9.206   6.617  23.375  1.00  0.00           C  
+HETATM    2  N   UNL     1       9.288   5.239  22.843  1.00  0.00           N1+
+HETATM    3  C   UNL     1       9.901   4.245  23.787  1.00  0.00           C  
+HETATM    4  C   UNL     1       9.552   2.837  23.275  1.00  0.00           C  
+HETATM    5  O   UNL     1      10.280   2.666  22.057  1.00  0.00           O  
+HETATM    6  C   UNL     1       9.877   1.780  24.331  1.00  0.00           C  
+HETATM    7  C   UNL     1       9.398   0.387  23.922  1.00  0.00           C  
+HETATM    8  C   UNL     1      10.424  -0.687  24.293  1.00  0.00           C  
+HETATM    9  C   UNL     1      11.616  -0.605  23.380  1.00  0.00           C  
+HETATM   10  C   UNL     1      11.773  -1.516  22.327  1.00  0.00           C  
+HETATM   11  C   UNL     1      12.918  -1.487  21.533  1.00  0.00           C  
+HETATM   12  C   UNL     1      13.916  -0.552  21.786  1.00  0.00           C  
+HETATM   13  C   UNL     1      13.767   0.367  22.824  1.00  0.00           C  
+HETATM   14  C   UNL     1      12.623   0.342  23.620  1.00  0.00           C  
+HETATM   15  H   UNL     1       8.759   7.256  22.643  1.00  0.00           H  
+HETATM   16  H   UNL     1      10.189   6.970  23.605  1.00  0.00           H  
+HETATM   17  H   UNL     1       8.609   6.620  24.264  1.00  0.00           H  
+HETATM   18  H   UNL     1       9.849   5.259  21.991  1.00  0.00           H  
+HETATM   19  H   UNL     1       8.329   4.932  22.679  1.00  0.00           H  
+HETATM   20  H   UNL     1       9.504   4.384  24.771  1.00  0.00           H  
+HETATM   21  H   UNL     1      10.962   4.375  23.832  1.00  0.00           H  
+HETATM   22  H   UNL     1       8.505   2.722  23.087  1.00  0.00           H  
+HETATM   23  H   UNL     1      11.228   2.771  22.229  1.00  0.00           H  
+HETATM   24  H   UNL     1       9.401   2.052  25.249  1.00  0.00           H  
+HETATM   25  H   UNL     1      10.941   1.741  24.440  1.00  0.00           H  
+HETATM   26  H   UNL     1       9.242   0.370  22.864  1.00  0.00           H  
+HETATM   27  H   UNL     1       8.487   0.178  24.443  1.00  0.00           H  
+HETATM   28  H   UNL     1       9.974  -1.653  24.199  1.00  0.00           H  
+HETATM   29  H   UNL     1      10.746  -0.530  25.301  1.00  0.00           H  
+HETATM   30  H   UNL     1      11.037  -2.214  22.138  1.00  0.00           H  
+HETATM   31  H   UNL     1      13.025  -2.159  20.758  1.00  0.00           H  
+HETATM   32  H   UNL     1      14.769  -0.538  21.204  1.00  0.00           H  
+HETATM   33  H   UNL     1      14.504   1.066  23.003  1.00  0.00           H  
+HETATM   34  H   UNL     1      12.517   1.022  24.389  1.00  0.00           H  
+CONECT    1    2   15   16   17                                       
+CONECT    2    1    3   18   19                                       
+CONECT    3    2    4   20   21                                       
+CONECT    4    3    5    6   22                                       
+CONECT    5    4   23                                                 
+CONECT    6    4    7   24   25                                       
+CONECT    7    6    8   26   27                                       
+CONECT    8    7    9   28   29                                       
+CONECT    9    8   10   10   14                                       
+CONECT   10    9    9   11   30                                       
+CONECT   11   10   12   12   31                                       
+CONECT   12   11   11   13   32                                       
+CONECT   13   12   14   14   33                                       
+CONECT   14    9   13   13   34                                       
+CONECT   15    1                                                      
+CONECT   16    1                                                      
+CONECT   17    1                                                      
+CONECT   18    2                                                      
+CONECT   19    2                                                      
+CONECT   20    3                                                      
+CONECT   21    3                                                      
+CONECT   22    4                                                      
+CONECT   23    5                                                      
+CONECT   24    6                                                      
+CONECT   25    6                                                      
+CONECT   26    7                                                      
+CONECT   27    7                                                      
+CONECT   28    8                                                      
+CONECT   29    8                                                      
+CONECT   30   10                                                      
+CONECT   31   11                                                      
+CONECT   32   12                                                      
+CONECT   33   13                                                      
+CONECT   34   14                                                      
+MASTER        0    0    0    0    0    0    0    0   34    0   34    0
+END
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mol_pdb_charges.tab	Wed Feb 17 13:00:12 2021 +0000
@@ -0,0 +1,1 @@
+CNCC(O)CCCc1ccccc1	1
--- a/test-data/rdkit_descriptors_result1.tab	Tue Jul 28 08:43:00 2020 -0400
+++ b/test-data/rdkit_descriptors_result1.tab	Wed Feb 17 13:00:12 2021 +0000
@@ -1,2 +1,2 @@
-MoleculeID	BalabanJ	BertzCT	Chi0	Chi0n	Chi0v	Chi1	Chi1n	Chi1v	Chi2n	Chi2v	Chi3n	Chi3v	Chi4n	Chi4v	EState_VSA1	EState_VSA10	EState_VSA11	EState_VSA2	EState_VSA3	EState_VSA4	EState_VSA5	EState_VSA6	EState_VSA7	EState_VSA8	EState_VSA9	ExactMolWt	FpDensityMorgan1	FpDensityMorgan2	FpDensityMorgan3	FractionCSP3	HallKierAlpha	HeavyAtomCount	HeavyAtomMolWt	Ipc	Kappa1	Kappa2	Kappa3	LabuteASA	MaxAbsEStateIndex	MaxAbsPartialCharge	MaxEStateIndex	MaxPartialCharge	MinAbsEStateIndex	MinAbsPartialCharge	MinEStateIndex	MinPartialCharge	MolLogP	MolMR	MolWt	NHOHCount	NOCount	NumAliphaticCarbocycles	NumAliphaticHeterocycles	NumAliphaticRings	NumAromaticCarbocycles	NumAromaticHeterocycles	NumAromaticRings	NumHAcceptors	NumHDonors	NumHeteroatoms	NumRadicalElectrons	NumRotatableBonds	NumSaturatedCarbocycles	NumSaturatedHeterocycles	NumSaturatedRings	NumValenceElectrons	PEOE_VSA1	PEOE_VSA10	PEOE_VSA11	PEOE_VSA12	PEOE_VSA13	PEOE_VSA14	PEOE_VSA2	PEOE_VSA3	PEOE_VSA4	PEOE_VSA5	PEOE_VSA6	PEOE_VSA7	PEOE_VSA8	PEOE_VSA9	RingCount	SMR_VSA1	SMR_VSA10	SMR_VSA2	SMR_VSA3	SMR_VSA4	SMR_VSA5	SMR_VSA6	SMR_VSA7	SMR_VSA8	SMR_VSA9	SlogP_VSA1	SlogP_VSA10	SlogP_VSA11	SlogP_VSA12	SlogP_VSA2	SlogP_VSA3	SlogP_VSA4	SlogP_VSA5	SlogP_VSA6	SlogP_VSA7	SlogP_VSA8	SlogP_VSA9	TPSA	VSA_EState1	VSA_EState10	VSA_EState2	VSA_EState3	VSA_EState4	VSA_EState5	VSA_EState6	VSA_EState7	VSA_EState8	VSA_EState9	fr_Al_COO	fr_Al_OH	fr_Al_OH_noTert	fr_ArN	fr_Ar_COO	fr_Ar_N	fr_Ar_NH	fr_Ar_OH	fr_COO	fr_COO2	fr_C_O	fr_C_O_noCOO	fr_C_S	fr_HOCCN	fr_Imine	fr_NH0	fr_NH1	fr_NH2	fr_N_O	fr_Ndealkylation1	fr_Ndealkylation2	fr_Nhpyrrole	fr_SH	fr_aldehyde	fr_alkyl_carbamate	fr_alkyl_halide	fr_allylic_oxid	fr_amide	fr_amidine	fr_aniline	fr_aryl_methyl	fr_azide	fr_azo	fr_barbitur	fr_benzene	fr_benzodiazepine	fr_bicyclic	fr_diazo	fr_dihydropyridine	fr_epoxide	fr_ester	fr_ether	fr_furan	fr_guanido	fr_halogen	fr_hdrzine	fr_hdrzone	fr_imidazole	fr_imide	fr_isocyan	fr_isothiocyan	fr_ketone	fr_ketone_Topliss	fr_lactam	fr_lactone	fr_methoxy	fr_morpholine	
fr_nitrile	fr_nitro	fr_nitro_arom	fr_nitro_arom_nonortho	fr_nitroso	fr_oxazole	fr_oxime	fr_para_hydroxylation	fr_phenol	fr_phenol_noOrthoHbond	fr_phos_acid	fr_phos_ester	fr_piperdine	fr_piperzine	fr_priamide	fr_prisulfonamd	fr_pyridine	fr_quatN	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea	qed
-3037	2.370228	503.61088	12.413849	8.821565	10.333422	8.058551	5.008353	5.764282	3.722845	4.595717	2.463985	2.934179	1.596526	1.985926	0.0	10.213055	0.0	11.499024	27.592991	0.0	12.132734	24.265468	0.0	0.0	23.20188	268.005785	0.764706	1.176471	1.588235	0.076923	-1.38	17	259.047	6943.4452	12.086867	4.861181	2.842672	109.048439	9.683208	0.507662	9.683208	0.118709	0.147014	0.118709	0.147014	-0.507662	3.9954	69.0396	269.127	2	2	0	0	0	2	0	2	2	2	4	0	2	0	0	0	88	10.213055	11.499024	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	23.20188	47.525105	16.466088	0.0	2	10.213055	23.20188	0.0	0.0	0.0	6.420822	0.0	57.570372	0.0	11.499024	0.0	0.0	11.499024	23.20188	10.213055	6.420822	0.0	11.126903	36.398202	10.045267	0.0	0.0	40.46	0.0	11.70887	0.0	20.448487	1.29642	0.294029	9.600621	0.373796	0.0	0.0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0.864713
+MoleculeID	BalabanJ	BertzCT	Chi0	Chi0n	Chi0v	Chi1	Chi1n	Chi1v	Chi2n	Chi2v	Chi3n	Chi3v	Chi4n	Chi4v	EState_VSA1	EState_VSA10	EState_VSA11	EState_VSA2	EState_VSA3	EState_VSA4	EState_VSA5	EState_VSA6	EState_VSA7	EState_VSA8	EState_VSA9	ExactMolWt	FormalCharge	FpDensityMorgan1	FpDensityMorgan2	FpDensityMorgan3	FractionCSP3	HallKierAlpha	HeavyAtomCount	HeavyAtomMolWt	Ipc	Kappa1	Kappa2	Kappa3	LabuteASA	MaxAbsEStateIndex	MaxAbsPartialCharge	MaxEStateIndex	MaxPartialCharge	MinAbsEStateIndex	MinAbsPartialCharge	MinEStateIndex	MinPartialCharge	MolLogP	MolMR	MolWt	NHOHCount	NOCount	NumAliphaticCarbocycles	NumAliphaticHeterocycles	NumAliphaticRings	NumAromaticCarbocycles	NumAromaticHeterocycles	NumAromaticRings	NumHAcceptors	NumHDonors	NumHeteroatoms	NumRadicalElectrons	NumRotatableBonds	NumSaturatedCarbocycles	NumSaturatedHeterocycles	NumSaturatedRings	NumValenceElectrons	PEOE_VSA1	PEOE_VSA10	PEOE_VSA11	PEOE_VSA12	PEOE_VSA13	PEOE_VSA14	PEOE_VSA2	PEOE_VSA3	PEOE_VSA4	PEOE_VSA5	PEOE_VSA6	PEOE_VSA7	PEOE_VSA8	PEOE_VSA9	RingCount	SMR_VSA1	SMR_VSA10	SMR_VSA2	SMR_VSA3	SMR_VSA4	SMR_VSA5	SMR_VSA6	SMR_VSA7	SMR_VSA8	SMR_VSA9	SSSR	SlogP_VSA1	SlogP_VSA10	SlogP_VSA11	SlogP_VSA12	SlogP_VSA2	SlogP_VSA3	SlogP_VSA4	SlogP_VSA5	SlogP_VSA6	SlogP_VSA7	SlogP_VSA8	SlogP_VSA9	TPSA	VSA_EState1	VSA_EState10	VSA_EState2	VSA_EState3	VSA_EState4	VSA_EState5	VSA_EState6	VSA_EState7	VSA_EState8	VSA_EState9	fr_Al_COO	fr_Al_OH	fr_Al_OH_noTert	fr_ArN	fr_Ar_COO	fr_Ar_N	fr_Ar_NH	fr_Ar_OH	fr_COO	fr_COO2	fr_C_O	fr_C_O_noCOO	fr_C_S	fr_HOCCN	fr_Imine	fr_NH0	fr_NH1	fr_NH2	fr_N_O	fr_Ndealkylation1	fr_Ndealkylation2	fr_Nhpyrrole	fr_SH	fr_aldehyde	fr_alkyl_carbamate	fr_alkyl_halide	fr_allylic_oxid	fr_amide	fr_amidine	fr_aniline	fr_aryl_methyl	fr_azide	fr_azo	fr_barbitur	fr_benzene	fr_benzodiazepine	fr_bicyclic	fr_diazo	fr_dihydropyridine	fr_epoxide	fr_ester	fr_ether	fr_furan	fr_guanido	fr_halogen	fr_hdrzine	fr_hdrzone	fr_imidazole	fr_imide	fr_isocyan	fr_isothiocyan	fr_ketone	fr_ketone_Topliss	fr_lactam	fr_lactone	
fr_methoxy	fr_morpholine	fr_nitrile	fr_nitro	fr_nitro_arom	fr_nitro_arom_nonortho	fr_nitroso	fr_oxazole	fr_oxime	fr_para_hydroxylation	fr_phenol	fr_phenol_noOrthoHbond	fr_phos_acid	fr_phos_ester	fr_piperdine	fr_piperzine	fr_priamide	fr_prisulfonamd	fr_pyridine	fr_quatN	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea	qed
+3037	2.370228	503.61088	12.413849	8.821565	10.333422	8.058551	5.008353	5.764282	3.722845	4.595717	2.463985	2.934179	1.596526	1.985926	0.0	10.213055	0.0	11.499024	27.592991	0.0	12.132734	24.265468	0.0	0.0	23.20188	268.005785	0	0.764706	1.176471	1.588235	0.076923	-1.38	17	259.047	6943.4452	12.086867	4.861181	2.842672	109.048439	9.683208	0.507662	9.683208	0.118709	0.147014	0.118709	0.147014	-0.507662	3.9954	69.0396	269.127	2	2	0	0	0	2	0	2	2	2	4	0	2	0	0	0	88	10.213055	11.499024	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	23.20188	47.525105	16.466088	0.0	2	10.213055	23.20188	0.0	0.0	0.0	6.420822	0.0	57.570372	0.0	11.499024	2	0.0	0.0	11.499024	23.20188	10.213055	6.420822	0.0	11.126903	36.398202	10.045267	0.0	0.0	40.46	0.0	11.70887	0.0	20.448487	1.29642	0.294029	9.600621	0.373796	0.0	0.0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0.864713
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rdkit_descriptors_subset.tab	Wed Feb 17 13:00:12 2021 +0000
@@ -0,0 +1,2 @@
+MoleculeID	FormalCharge	MolWt	qed
+3037	0	269.127	0.864713