changeset 0:d7ea4e8cb1f3 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit a44364ca5bccd47f9f331143e1abb286096e8807
author bgruening
date Sat, 20 May 2017 12:41:19 -0400
parents
children 2f212f34b80c
files rdkit_descriptors.py rdkit_descriptors.xml test-data/CID_3037.sdf test-data/rdkit_descriptors_result1.tab
diffstat 4 files changed, 454 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rdkit_descriptors.py	Sat May 20 12:41:19 2017 -0400
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+from rdkit.Chem import Descriptors
+from rdkit import Chem
+import sys, os, re
+import argparse
+import inspect
+
+def get_supplier( infile, format = 'smiles' ):
+    """
+    Returns a generator over a SMILES or InChI file. Every element is of RDKit 
+    molecule and has its original string as _Name property.
+    """
+    with open(infile) as handle:
+        for line in handle:
+            line = line.strip()
+            if format == 'smiles':
+                mol = Chem.MolFromSmiles( line, sanitize=True )
+            elif format == 'inchi':
+                mol = Chem.inchi.MolFromInchi( line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False )
+            if mol is None:
+                yield False
+            else:
+                mol.SetProp( '_Name', line.split('\t')[0] )
+                yield mol
+
+
+def get_rdkit_descriptor_functions():
+    """
+    Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function)
+    """
+    ret = [ (name, f) for name, f in inspect.getmembers( Descriptors ) if inspect.isfunction( f ) and not name.startswith( '_' ) ]
+    ret.sort()
+    return ret
+
+
+def descriptors( mol, functions ):
+    """
+    Calculates the descriptors of a given molecule.
+    """
+    for name, function in functions:
+        yield (name, function( mol ))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
+    parser.add_argument("--iformat", help="Specify the input file format.")
+
+    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), 
+        default=sys.stdout, help="path to the result file, default it sdtout")
+
+    parser.add_argument("--header", dest="header", action="store_true",
+                    default=False,
+                    help="Write header line.")
+
+    args = parser.parse_args()
+
+    if args.iformat == 'sdf':
+        supplier = Chem.SDMolSupplier( args.infile )
+    elif args.iformat =='smi':
+        supplier = get_supplier( args.infile, format = 'smiles' )
+    elif args.iformat == 'inchi':
+        supplier = get_supplier( args.infile, format = 'inchi' )
+
+    functions = get_rdkit_descriptor_functions()
+
+    if args.header:
+        args.outfile.write( '%s\n' % '\t'.join( [name for name, f in functions] ) )
+
+    for mol in supplier:
+        if not mol:
+            continue
+        descs = descriptors( mol, functions )
+        molecule_id = mol.GetProp("_Name")
+        args.outfile.write( "%s\n" % '\t'.join( [molecule_id]+ [str(res) for name, res in descs] ) )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rdkit_descriptors.xml	Sat May 20 12:41:19 2017 -0400
@@ -0,0 +1,155 @@
+<tool id="ctb_rdkit_describtors" name="Descriptors" version="0.3">
+    <description>calculated with RDKit</description>
+    <!--parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10000" shared_inputs="" merge_outputs="outfile"></parallelism-->
+    <requirements>
+        <requirement type="package" version="2016.03.3">rdkit</requirement>
+    </requirements>
+    <command>
+<![CDATA[
+        python '$__tool_directory__/rdkit_descriptors.py'
+            -i '${infile}'
+            --iformat '${infile.ext}'
+            -o '${outfile}'
+            $header
+]]>
+    </command>
+    <inputs>
+        <param name="infile" format="smi,sdf,mol2" type="data" label="Molecule data"
+            help="In SD- or SMILES-format"/>
+        <param name="header" type="boolean" label="Include the descriptor name as header"
+            truevalue="--header" falsevalue="" checked="false" />
+    </inputs>
+    <outputs>
+        <data format="tabular" name="outfile" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="infile" ftype='sdf' value="CID_3037.sdf" />
+            <param name="header" value="True" />
+            <output name="outfile" ftype='tabular' file="rdkit_descriptors_result1.tab" />
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+
+.. class:: infomark
+
+**What this tool does**
+
+| RDKit is an open source toolkit for cheminformatics and machine learning.
+| This implementation focuses on descriptor calculation, although RDKit offers a vast number of other functions.
+|
+| The table below shows a brief overview of the descriptors.
+|
+
++-----------------------------------+------------+
+|    Descriptor/Descriptor Family   |  Language  |
++===================================+============+
+| Gasteiger/Marsili Partial Charges |     C++    |
++-----------------------------------+------------+
+|            BalabanJ               |   Python   |
++-----------------------------------+------------+
+|             BertzCT               |   Python   |
++-----------------------------------+------------+
+|               Ipc                 |   Python   |
++-----------------------------------+------------+
+|          HallKierAlpha            |   Python   |
++-----------------------------------+------------+
+|         Kappa1 - Kappa3           |   Python   |
++-----------------------------------+------------+
+|            Chi0, Chi1             |   Python   |
++-----------------------------------+------------+
+|           Chi0n - Chi4n           |   Python   |
++-----------------------------------+------------+
+|           Chi0v - Chi4v           |   Python   |
++-----------------------------------+------------+
+|              MolLogP              |     C++    |
++-----------------------------------+------------+
+|               MolMR               |     C++    |
++-----------------------------------+------------+
+|               MolWt               |     C++    |
++-----------------------------------+------------+
+|           HeavyAtomCount          |   Python   |
++-----------------------------------+------------+
+|           HeavyAtomMolWt          |   Python   |
++-----------------------------------+------------+
+|             NHOHCount             |     C++    |
++-----------------------------------+------------+
+|              NOCount              |     C++    |
++-----------------------------------+------------+
+|            NumHAcceptors          |     C++    |
++-----------------------------------+------------+
+|             NumHDonors            |     C++    |
++-----------------------------------+------------+
+|            NumHeteroatoms         |     C++    |
++-----------------------------------+------------+
+|          NumRotatableBonds        |     C++    |
++-----------------------------------+------------+
+|         NumValenceElectrons       |   Python   |
++-----------------------------------+------------+
+|              RingCount            |     C++    |
++-----------------------------------+------------+
+|                 TPSA              |     C++    |
++-----------------------------------+------------+
+|              LabuteASA            |     C++    |
++-----------------------------------+------------+
+|       PEOE_VSA1 - PEOE_VSA14      | Python/C++ |
++-----------------------------------+------------+
+|         SMR_VSA1 - SMR_VSA10      | Python/C++ |
++-----------------------------------+------------+
+|      SlogP_VSA1 - SlogP_VSA12     | Python/C++ |
++-----------------------------------+------------+
+|     EState_VSA1 - EState_VSA11    |   Python   |
++-----------------------------------+------------+
+|     VSA_EState1 - VSA_EState10    |   Python   |
++-----------------------------------+------------+
+|           Topliss fragments       |   Python   |
++-----------------------------------+------------+
+
+|
+| A full list of the descriptors can be obtained here_.
+
+.. _here: https://rdkit.readthedocs.org/en/latest/GettingStartedInPython.html#list-of-available-descriptors
+
+-----
+
+.. class:: warningmark
+
+**HINT**
+
+Use the **cut columns from a table** tool to select just the desired descriptors.
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+| - `SD-Format`_
+| - `SMILES Format`_
+| - `Corina MOL2`_
+
+.. _SD-Format: http://en.wikipedia.org/wiki/Chemical_table_file
+.. _SMILES Format: http://en.wikipedia.org/wiki/Simplified_molecular_input_line_entry_specification
+.. _Corina MOL2: http://www.molecular-networks.com/products/corina
+
+-----
+
+.. class:: infomark
+
+ **Output**
+
+Tabular file, where each descriptor (value) is shown in a separate column.
+
+
+]]>
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @article{rdkit,
+                author = {Greg Landrum},
+                title = {RDKit: Open-source cheminformatics},
+                url ={http://www.rdkit.org}
+            }</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/CID_3037.sdf	Sat May 20 12:41:19 2017 -0400
@@ -0,0 +1,220 @@
+3037
+  -OEChem-08231108593D
+
+ 27 28  0     0  0  0  0  0  0999 V2000
+   -4.8550    1.3401    0.2120 Cl  0  0  0  0  0  0  0  0  0  0  0  0
+    4.8529   -1.3406    0.2121 Cl  0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1809   -2.1668   -0.3789 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.1788    2.1664   -0.3787 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0011   -0.0002    1.4744 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.2222   -0.2738    0.6597 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2377    0.2772    0.6480 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.2586   -1.3462   -0.2316 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2565    1.3457   -0.2314 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.3343    0.5568    0.7972 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.3322   -0.5574    0.7972 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.4069   -1.5879   -0.9855 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4048    1.5875   -0.9852 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.4827    0.3152    0.0433 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4807   -0.3156    0.0435 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.5190   -0.7571   -0.8481 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5170    0.7568   -0.8478 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.1548    0.8649    2.1342 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.1601   -0.8435    2.1593 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.3089    1.3938    1.4913 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.3053   -1.3909    1.4943 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.4415   -2.4213   -1.6818 H   0  0  0  0  0  0  0  0  0  0  0  0
+    2.4469    2.4191   -1.6835 H   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.4070   -0.9574   -1.4422 H   0  0  0  0  0  0  0  0  0  0  0  0
+    4.4050    0.9570   -1.4418 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.2961   -2.2262    0.4641 H   0  0  0  0  0  0  0  0  0  0  0  0
+    0.3872    2.8487   -1.0397 H   0  0  0  0  0  0  0  0  0  0  0  0
+  1 14  1  0  0  0  0
+  2 15  1  0  0  0  0
+  3  8  1  0  0  0  0
+  3 26  1  0  0  0  0
+  4  9  1  0  0  0  0
+  4 27  1  0  0  0  0
+  5  6  1  0  0  0  0
+  5  7  1  0  0  0  0
+  5 18  1  0  0  0  0
+  5 19  1  0  0  0  0
+  6  8  2  0  0  0  0
+  6 10  1  0  0  0  0
+  7  9  2  0  0  0  0
+  7 11  1  0  0  0  0
+  8 12  1  0  0  0  0
+  9 13  1  0  0  0  0
+ 10 14  2  0  0  0  0
+ 10 20  1  0  0  0  0
+ 11 15  2  0  0  0  0
+ 11 21  1  0  0  0  0
+ 12 16  2  0  0  0  0
+ 12 22  1  0  0  0  0
+ 13 17  2  0  0  0  0
+ 13 23  1  0  0  0  0
+ 14 16  1  0  0  0  0
+ 15 17  1  0  0  0  0
+ 16 24  1  0  0  0  0
+ 17 25  1  0  0  0  0
+M  END
+> <PUBCHEM_COMPOUND_CID>
+3037
+
+> <PUBCHEM_CONFORMER_RMSD>
+0.6
+
+> <PUBCHEM_CONFORMER_DIVERSEORDER>
+8
+10
+12
+1
+7
+5
+11
+3
+6
+9
+4
+2
+
+> <PUBCHEM_MMFF94_PARTIAL_CHARGES>
+25
+1 -0.18
+10 -0.15
+11 -0.15
+12 -0.15
+13 -0.15
+14 0.18
+15 0.18
+16 -0.15
+17 -0.15
+2 -0.18
+20 0.15
+21 0.15
+22 0.15
+23 0.15
+24 0.15
+25 0.15
+26 0.45
+27 0.45
+3 -0.53
+4 -0.53
+5 0.29
+6 -0.14
+7 -0.14
+8 0.08
+9 0.08
+
+> <PUBCHEM_EFFECTIVE_ROTOR_COUNT>
+2
+
+> <PUBCHEM_PHARMACOPHORE_FEATURES>
+4
+1 3 donor
+1 4 donor
+6 6 8 10 12 14 16 rings
+6 7 9 11 13 15 17 rings
+
+> <PUBCHEM_HEAVY_ATOM_COUNT>
+17
+
+> <PUBCHEM_ATOM_DEF_STEREO_COUNT>
+0
+
+> <PUBCHEM_ATOM_UDEF_STEREO_COUNT>
+0
+
+> <PUBCHEM_BOND_DEF_STEREO_COUNT>
+0
+
+> <PUBCHEM_BOND_UDEF_STEREO_COUNT>
+0
+
+> <PUBCHEM_ISOTOPIC_ATOM_COUNT>
+0
+
+> <PUBCHEM_COMPONENT_COUNT>
+1
+
+> <PUBCHEM_CACTVS_TAUTO_COUNT>
+5
+
+> <PUBCHEM_CONFORMER_ID>
+00000BDD00000008
+
+> <PUBCHEM_MMFF94_ENERGY>
+44.6858
+
+> <PUBCHEM_FEATURE_SELFOVERLAP>
+20.297
+
+> <PUBCHEM_SHAPE_FINGERPRINT>
+10062212 137 18261117369936506423
+104564 63 17986963035811110412
+11458722 120 18339359768245870841
+11471102 22 5472872458301843344
+11578080 2 18190204380446433792
+116883 192 18265608969609498196
+12236239 1 18410856576819659107
+12592029 89 18338223951597366363
+13549 16 18410575084668353682
+13693222 15 6555421915516066822
+13764800 53 14189033175566991199
+14115302 16 18186237320680093898
+14341114 328 10087642619424135543
+14787075 74 9511159855286719151
+14993402 34 18410855451538227223
+15099037 51 18340768233908588503
+15207287 21 15719111361650760302
+15375358 24 15647053767618106914
+15775835 57 18272650117329930317
+16945 1 17906452130063974618
+17834072 14 15936410035134206066
+18186145 218 17132117918276567720
+19422 9 18271525295227750719
+20279233 1 15719389529571237654
+20645476 183 18339080393619327415
+23402539 116 18186809105365620101
+23402655 69 18342736308283284156
+23559900 14 17603590712323212176
+25 1 17561083592297532664
+26918003 58 6266902359448424189
+296302 2 15213020427345972082
+3082319 5 18338798905472319583
+34934 24 18341891845236497020
+633830 44 17703790310130762689
+74978 22 18266740181857992718
+7832392 63 18340206284835898173
+81228 2 15720767252053392762
+9981440 41 17403743242177431832
+
+> <PUBCHEM_SHAPE_MULTIPOLES>
+341.85
+8.38
+1.9
+1.1
+0.02
+0
+-1.15
+1.94
+-0.01
+0
+-0.39
+-4.15
+0.01
+0
+
+> <PUBCHEM_SHAPE_SELFOVERLAP>
+722.787
+
+> <PUBCHEM_SHAPE_VOLUME>
+193
+
+> <PUBCHEM_COORDINATE_TYPE>
+2
+5
+255
+
+$$$$
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rdkit_descriptors_result1.tab	Sat May 20 12:41:19 2017 -0400
@@ -0,0 +1,2 @@
+BalabanJ	BertzCT	Chi0	Chi0n	Chi0v	Chi1	Chi1n	Chi1v	Chi2n	Chi2v	Chi3n	Chi3v	Chi4n	Chi4v	EState_VSA1	EState_VSA10	EState_VSA11	EState_VSA2	EState_VSA3	EState_VSA4	EState_VSA5	EState_VSA6	EState_VSA7	EState_VSA8	EState_VSA9	ExactMolWt	FractionCSP3	HallKierAlpha	HeavyAtomCount	HeavyAtomMolWt	Ipc	Kappa1	Kappa2	Kappa3	LabuteASA	MaxAbsEStateIndex	MaxAbsPartialCharge	MaxEStateIndex	MaxPartialCharge	MinAbsEStateIndex	MinAbsPartialCharge	MinEStateIndex	MinPartialCharge	MolLogP	MolMR	MolWt	NHOHCount	NOCount	NumAliphaticCarbocycles	NumAliphaticHeterocycles	NumAliphaticRings	NumAromaticCarbocycles	NumAromaticHeterocycles	NumAromaticRings	NumHAcceptors	NumHDonors	NumHeteroatoms	NumRadicalElectrons	NumRotatableBonds	NumSaturatedCarbocycles	NumSaturatedHeterocycles	NumSaturatedRings	NumValenceElectrons	PEOE_VSA1	PEOE_VSA10	PEOE_VSA11	PEOE_VSA12	PEOE_VSA13	PEOE_VSA14	PEOE_VSA2	PEOE_VSA3	PEOE_VSA4	PEOE_VSA5	PEOE_VSA6	PEOE_VSA7	PEOE_VSA8	PEOE_VSA9	RingCount	SMR_VSA1	SMR_VSA10	SMR_VSA2	SMR_VSA3	SMR_VSA4	SMR_VSA5	SMR_VSA6	SMR_VSA7	SMR_VSA8	SMR_VSA9	SlogP_VSA1	SlogP_VSA10	SlogP_VSA11	SlogP_VSA12	SlogP_VSA2	SlogP_VSA3	SlogP_VSA4	SlogP_VSA5	SlogP_VSA6	SlogP_VSA7	SlogP_VSA8	SlogP_VSA9	TPSA	VSA_EState1	VSA_EState10	VSA_EState2	VSA_EState3	VSA_EState4	VSA_EState5	VSA_EState6	VSA_EState7	VSA_EState8	VSA_EState9	fr_Al_COO	fr_Al_OH	fr_Al_OH_noTert	fr_ArN	fr_Ar_COO	fr_Ar_N	fr_Ar_NH	fr_Ar_OH	fr_COO	fr_COO2	fr_C_O	fr_C_O_noCOO	fr_C_S	fr_HOCCN	fr_Imine	fr_NH0	fr_NH1	fr_NH2	fr_N_O	fr_Ndealkylation1	fr_Ndealkylation2	fr_Nhpyrrole	fr_SH	fr_aldehyde	fr_alkyl_carbamate	fr_alkyl_halide	fr_allylic_oxid	fr_amide	fr_amidine	fr_aniline	fr_aryl_methyl	fr_azide	fr_azo	fr_barbitur	fr_benzene	fr_benzodiazepine	fr_bicyclic	fr_diazo	fr_dihydropyridine	fr_epoxide	fr_ester	fr_ether	fr_furan	fr_guanido	fr_halogen	fr_hdrzine	fr_hdrzone	fr_imidazole	fr_imide	fr_isocyan	fr_isothiocyan	fr_ketone	fr_ketone_Topliss	fr_lactam	fr_lactone	fr_methoxy	fr_morpholine	fr_nitrile	fr_nitro	fr_nitro_arom	fr_nitro_arom_nonortho	
fr_nitroso	fr_oxazole	fr_oxime	fr_para_hydroxylation	fr_phenol	fr_phenol_noOrthoHbond	fr_phos_acid	fr_phos_ester	fr_piperdine	fr_piperzine	fr_priamide	fr_prisulfonamd	fr_pyridine	fr_quatN	fr_sulfide	fr_sulfonamd	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea
+3037	2.37022757927	503.610880418	12.4138490834	8.82156453334	10.3334224254	8.05855064806	5.00835259312	5.76428153914	3.72284524811	4.59571680905	2.46398508386	2.93417928943	1.59652584133	1.98592629408	0.0	10.2130547897	0.0	11.4990236666	27.5929912338	0.0	12.1327341369	24.2654682738	0.0	0.0	23.2018797805	268.00578492	0.0769230769231	-1.38	17	259.047	6943.44519959	12.0868667938	4.86118110558	2.84267247008	109.048439398	9.68320845931	0.50766175334	9.68320845931	0.118708899658	0.1470143613	0.118708899658	0.1470143613	-0.50766175334	3.9954	69.0396	269.127	2	2	0	0	0	2	0	2	2	2	4	0	2	0	0	0	88	10.2130547897	11.4990236666	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	23.2018797805	47.5251053942	16.4660882504	0.0	2	10.2130547897	23.2018797805	0.0	0.0	0.0	6.42082162293	0.0	57.5703720216	0.0	11.4990236666	0.0	0.0	11.4990236666	23.2018797805	10.2130547897	6.42082162293	0.0	11.1269029834	36.3982024108	10.0452666275	0.0	0.0	40.46	0.0	11.7088697125	0.0	0.0	0.0	0.0	0.0	0.0	0.0	32.0133525097	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	2	2	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0