changeset 0:915e9be38994 draft

planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 2e3c3c2bd7ecdc9c2968a32f91e81136e0cb3835
author bgruening
date Mon, 05 Aug 2019 05:21:58 -0400
parents
children 6f8458d1cf46
files chembl.py chembl.xml test-data/in1.smi test-data/out1.smi test-data/out2.smi test-data/out3.smi test-data/out4.smi
diffstat 7 files changed, 345 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chembl.py	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,102 @@
+from chembl_webresource_client.new_client import new_client
+import argparse
+
+def open_file(filename):
+    """
+    Read the query SMILES from the first line of a file.
+
+    Only the first whitespace-separated field of that line is returned, so
+    the trailing newline kept by readline() (and any text that follows the
+    SMILES on the same line) is not passed on as part of the query.
+    Returns an empty string if the file is empty or its first line is blank.
+    """
+    with open(filename) as f:
+        fields = f.readline().split()
+    return fields[0] if fields else ''
+
+def get_smiles(res):
+    """
+    Get the set of unique canonical SMILES from function results
+
+    Each item of ``res`` is read as
+    item['molecule_structures']['canonical_smiles']. Items for which that
+    lookup is not possible (the 'molecule_structures' entry is None or a
+    key is missing) or which carry an empty SMILES are skipped, so a single
+    record without a structure does not abort the whole search.
+    """
+    smiles = set()
+    for mol in res:
+        try:
+            smi = mol['molecule_structures']['canonical_smiles']
+        except (KeyError, TypeError):
+            # no structure information for this record
+            continue
+        if smi:
+            smiles.add(smi)
+    return smiles
+
+def sim_search(smiles, tanimoto):
+    """
+    Run a ChEMBL similarity search for the SMILES input.
+
+    ``tanimoto`` is passed on as the similarity value of the query, and the
+    result is restricted via only() to the 'molecule_structures' field.
+    """
+    hits = new_client.similarity.filter(smiles=smiles, similarity=tanimoto)
+    return hits.only(['molecule_structures'])
+    
+def substr_search(smiles):
+    """
+    Run a ChEMBL substructure search for the SMILES input.
+
+    The result is restricted via only() to the 'molecule_structures' field.
+    """
+    matches = new_client.substructure.filter(smiles=smiles)
+    return matches.only(['molecule_structures'])
+    
+def filter_drugs(mols):
+    """
+    Narrow the query to approved drugs, i.e. records with max_phase equal to 4
+    """
+    approved = mols.filter(max_phase=4)
+    return approved
+
+def filter_biotherapeutic(mols):
+    """
+    Narrow the query to biotherapeutic molecules, i.e. records whose
+    'biotherapeutic' field is not null
+    """
+    biotherapeutics = mols.filter(biotherapeutic__isnull=False)
+    return biotherapeutics
+
+def filter_nat_prod(mols):
+    """
+    Narrow the query to natural products, i.e. records with natural_product
+    equal to 1
+    """
+    natural_products = mols.filter(natural_product=1)
+    return natural_products
+
+def filter_ro5(mols):
+    """
+    Narrow the query to compounds whose number of rule-of-five (RO5)
+    violations is 0
+    """
+    ro5_compliant = mols.filter(molecule_properties__num_ro5_violations=0)
+    return ro5_compliant
+
+def main():
+    """
+    Command-line entry point: search ChEMBL and write the hits as SMILES.
+
+    The query SMILES is taken from -i or, when -f is given, from the first
+    line of that file. A substructure search is run if -s is set, otherwise
+    a similarity search using the -t score. The optional -d, -b, -n and -r
+    filters are then applied in that order, and the unique SMILES of the
+    results are written to the -o file, one per line (no trailing newline).
+
+    Raises IOError if the query SMILES is shorter than 5 characters.
+    Missing options are reported through argparse (usage message, exit
+    status 2) before any search is started.
+    """
+    parser = argparse.ArgumentParser(description='Search ChEMBL database for compounds')
+    parser.add_argument('-i', '--input', help='SMILES input')
+    parser.add_argument('-f', '--file', help='SMILES input as file')
+    parser.add_argument('-o', '--output', help="SMILES output")
+    parser.add_argument('-t', '--tanimoto', type=int, help='Tanimoto similarity score')
+    parser.add_argument('-s', '--substructure', action='store_true', help='Substructure search using the SMILES input.')
+    parser.add_argument('-d', '--drugs', action='store_true', help='Filter approved drugs')
+    parser.add_argument('-b', '--biotherapeutic', action='store_true', help='Filter biotherapeutic molecules')
+    parser.add_argument('-n', '--nat-prod', action='store_true', help='Filter natural products')
+    parser.add_argument('-r', '--ro5', action='store_true', help='Filter compounds that pass Lipinski RO5')
+
+    args = parser.parse_args()
+
+    # Validate the options up front, so that an incomplete invocation gives
+    # a usage message instead of a TypeError part-way through the run.
+    if not args.output:
+        parser.error('an output file (-o) is required.')
+    if args.input is None and not args.file:
+        parser.error('a SMILES input is required: use -i or -f.')
+    if not args.substructure and args.tanimoto is None:
+        parser.error('a Tanimoto score (-t) is required for a similarity search.')
+
+    if args.file:  # get SMILES from file rather than -i option
+        args.input = open_file(args.file)
+
+    # Drop surrounding whitespace (e.g. a newline read from the file) so it
+    # is neither counted in the length check nor sent as part of the query.
+    args.input = args.input.strip()
+
+    if len(args.input) < 5:
+        raise IOError('SMILES must be at least 5 characters long.')
+
+    if args.substructure:  # specify search type: substructure or similarity
+        mols = substr_search(args.input)
+    else:
+        mols = sim_search(args.input, args.tanimoto)
+
+    # filter options:
+    if args.drugs:
+        mols = filter_drugs(mols)
+
+    if args.biotherapeutic:
+        mols = filter_biotherapeutic(mols)
+
+    if args.nat_prod:
+        mols = filter_nat_prod(mols)
+
+    if args.ro5:
+        mols = filter_ro5(mols)
+
+    # get SMILES from search output
+    mols = get_smiles(mols)
+
+    # write to file
+    with open(args.output, 'w') as f:
+        f.write('\n'.join(mols))
+    
+
+# Run the command-line interface only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chembl.xml	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,118 @@
+<tool id="chembl" name="Search ChEMBL database" version="0.1.0">
+    <description>for compounds which are similar to a SMILES string</description>
+    <requirements>
+        <requirement type="package" version="0.9.31">chembl_webresource_client</requirement>
+    </requirements>
+    <!-- Builds the chembl.py call: input flag (-i/-f) with its value, the output
+         path, the search type (-t with its score, or -s) and the optional filter
+         flags. Path and text values are single-quoted for the shell. -->
+    <command detect_errors="exit_code"><![CDATA[
+        python -W ignore '$__tool_directory__/chembl.py'
+            $input.format '$input.smiles'
+            -o '$outfile'
+            $search.type
+            #if $search.type == '-t':
+                $search.tanimoto
+            #end if
+            $drugs
+            $biotherapeutic
+            $natprod
+            $ro5
+    ]]></command>
+    <inputs>
+        <conditional name="input">
+            <param name='format' type='select' format='text' label="SMILES input type" help="Enter SMILES as either text or file.">
+                <option value='-i'>Text</option>
+                <option value='-f'>File</option>
+            </param>
+            <when value='-i'>
+                <param name="smiles" type="text" label="SMILES input" help="Enter SMILES for a compound.">
+                    <validator type='length' min='5'/>
+                </param>
+            </when>
+            <when value='-f'>
+                <param name="smiles" type="data" format="smi" label="Input file" help="File containing a single compound in SMILES format. Note that only the first line of the file will be read if the file contains multiple compounds."/>
+            </when>
+        </conditional>
+        
+        <conditional name="search">
+            <param name='type' type='select' format='text' label="Search type" help="Search for compounds which are similar to the SMILES input, or which contain the SMILES input as a substructure">
+                <option value='-t'>Similarity</option>
+                <option value='-s'>Substructure</option>
+            </param>
+            <when value="-t">
+                <param type="integer" name="tanimoto" label="Tanimoto cutoff score" help="Score for similarity search. Minimum value is 70." value="70" min="70" max="100"/>
+            </when>
+            <when value="-s"/>
+        </conditional>
+        
+        <param name="drugs" type="boolean" value="false" label="Filter to return only approved drugs" truevalue="-d" falsevalue=""/>
+        <param name="biotherapeutic" type="boolean" value="false" label="Filter to return only biotherapeutic molecules" truevalue="-b" falsevalue=""/>
+        <param name="natprod" type="boolean" value="false" label="Filter to return only natural products" truevalue="-n" falsevalue=""/>
+        <param name="ro5" type="boolean" value="false" label="Filter for Lipinski's Rule of Five" truevalue="-r" falsevalue=""/>
+    </inputs>
+    <outputs>
+        <data name="outfile" format="smi" />
+    </outputs>
+    <tests>
+        <test>
+            <param name="format" value="-f"/>
+            <param name="smiles" value="in1.smi"/>
+            <param name='type' value='-t' />
+            <param name='tanimoto' value='70' />
+            <output name="outfile" ftype="smi" file='out1.smi'/>
+        </test>
+        <test>
+            <param name="format" value="-f"/>
+            <param name="smiles" value="in1.smi"/>
+            <param name='type' value='-t' />
+            <param name='tanimoto' value='70' />
+            <param name='drugs' value='true'/>
+            <output name="outfile" ftype="smi" file='out2.smi'/>
+        </test>
+        <test>
+            <param name="format" value="-f"/>
+            <param name="smiles" value="in1.smi"/>
+            <param name='type' value='-s' />
+            <output name="outfile" ftype="smi" file='out3.smi'/>
+        </test>    
+        <test>
+            <param name="format" value="-i"/>
+            <param name="smiles" value="C1CCCCC1"/>
+            <param name='type' value='-t' />
+            <param name='tanimoto' value='70' />
+            <output name="outfile" ftype="smi" file='out4.smi'/>
+        </test>       
+    </tests>
+    <help><![CDATA[
+
+Search the ChEMBL database for compounds which resemble a SMILES string. Two 
+search options are possible: similarity (searches for compounds which are 
+similar to the input within a specified Tanimoto cutoff) and substructure 
+(searches for compounds which contain the input substructure).
+
+Results can be filtered for compounds which 1) are approved drugs, 2) are biotherapeutic,
+3) are natural products and 4) fulfil all of the Lipinski rule of five criteria.
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+A single molecule in SMILES format. This can be submitted either as text or as a 
+file containing the SMILES string on the first line. Note that if the file contains 
+multiple lines, only the SMILES string on the first line will be used for the search.
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+A SMILES file with search results, each on a new line.
+
+    ]]></help>
+
+    <citations>
+        <citation type="doi">10.1093/nar/gkv352</citation>
+        <citation type="doi">arXiv:1607.00378v1</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/in1.smi	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,1 @@
+CN1CCC[C@H]1c2cccnc2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out1.smi	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,47 @@
+CN1CCCC1c2cccnc2
+CN1CCC[C@@H]1c2cccnc2
+CN1CCC[C@H]1c2cccnc2
+CCN1CCCC1c2cccnc2
+CN1CCCC1c2ccc(C)nc2
+CCc1ccc(cn1)C2CCCN2C
+CN1CCCC1c2cncc(C)c2
+CCCc1ccc(cn1)C2CCCN2C
+CCc1cncc(c1)C2CCCN2C
+CN1CCCC[C@H]1c2cccnc2
+CN1CCCCC1c2cccnc2
+CCCc1cncc(c1)C2CCCN2C
+CN1CCCC1c2cccnc2C
+CCCCc1ccc(cn1)C2CCCN2C
+CCCCCc1ccc(cn1)C2CCCN2C
+CC1CCN(C)[C@@H]1c2cccnc2
+CN1CCCC1c2ccc(CCCc3ccccc3)nc2
+CN1CCCC1c2cncc(Cl)c2
+CN1CCCC1c2ccc(CCc3ccccc3)nc2
+CN1CCC[C@H]1c2ccccc2
+CN1CCCC1c2ccccc2
+CN1CCC[C@H]1c2ccccn2
+CN1CCCC1c2cncc(F)c2
+COc1cncc(c1)C2CCCN2C
+CN1CCCC1c2cncc(Br)c2
+CN1CCCC1c2ccc(nc2)c3ccccc3
+CN1CCCC1c2ccc(\C=C\c3ccccc3)nc2
+COc1ccncc1C2CCCN2C
+CCCC[C@H]1CC[C@H](N1C)c2cccnc2
+CCCC[C@@H]1CC[C@H](N1C)c2cccnc2
+CN1CCCC1c2cnccc2N
+C[C@H]1C[C@H](N(C)C1)c2cccnc2
+CC[C@H]1C[C@H](N(C)C1)c2cccnc2
+CN1CCCC1c2ccc(CCc3ccc(Cl)cc3)nc2
+CN1CCCC1c2ccc(Cl)nc2
+CN1CCCC1c2ccc(C)cc2
+COCC1CCN(C)[C@@H]1c2cccnc2
+C(N1CCCC1c2cccnc2)c3ccccc3
+C[C@H]1CC[C@H](N1C)c2cccnc2
+C[C@@H]1CC[C@H](N1C)c2cccnc2
+CN1CCCC1c2ccc(Cl)cc2
+CN1CCCC1c2ccc(F)nc2
+CN1CCC(CF)[C@H]1c2cccnc2
+CN1CCCC1c2ccc(Br)nc2
+COc1ccc(CCc2ccc(cn2)C3CCCN3C)cc1
+CN1CCC(CO)[C@H]1c2cccnc2
+CN1CCCC1c2cnc3ccccc3c2
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out2.smi	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,1 @@
+CN1CCC[C@H]1c2cccnc2
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out3.smi	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,72 @@
+CN1CCC[C@H]1c2cccnc2
+CN1CCC[C@H]1c2ccc[n+]([BH2-]C#N)c2
+CN1CC[C@H]2CCc3ncccc3[C@@H]12
+CN1CC[C@H]2CCc3c(ccc[n+]3[BH2-]C#N)[C@@H]12
+CN1[C@@H](CC[C@H]1c2cccnc2)C#N
+CN1[C@H](CC[C@H]1c2cccnc2)C#N
+CN1CCC[C@H]1c2cncc(c2)C#C
+CN1C[C@@H](Cc2ccccc2)C[C@H]1c3cccnc3
+C[C@@H]1CC[C@H](N1C)c2cccnc2
+C[C@H]1CC[C@H](N1C)c2cccnc2
+CC[C@H]1C[C@H](N(C)C1)c2cccnc2
+CN1C[C@@H](O)C[C@H]1c2cccnc2
+CN1CCC(CO)[C@H]1c2cccnc2
+CSC[C@H]1C[C@H](N(C)C1)c2cccnc2
+CN1C[C@H](CO)C[C@H]1c2cccnc2
+CN1C[C@@H](CC#N)C[C@H]1c2cccnc2
+CN1C[C@@H](CF)C[C@H]1c2cccnc2
+CO[C@H]1C[C@H](N(C)C1)c2cccnc2
+CN1CCC(CF)[C@H]1c2cccnc2
+CC1CCN(C)[C@@H]1c2cccnc2
+COCC1CCN(C)[C@@H]1c2cccnc2
+CN1C[C@@H](CO)C[C@H]1c2cccnc2
+CN1C[C@H](C[C@H]1c2cccnc2)OC(=O)C
+CN1C[C@@H](C[C@H]1c2cccnc2)C#N
+CC1CN(C)[C@@H](C1C)c2cccnc2
+C[C@H]1C[C@H](N(C)C1)c2cccnc2
+CN1C[C@H](C[C@H]1c2cccnc2)OS(=O)(=O)C
+COC[C@H]1C[C@H](N(C)C1)c2cccnc2
+CCCC[C@@H]1CC[C@H](N1C)c2cccnc2
+CCCC[C@H]1CC[C@H](N1C)c2cccnc2
+CN1[C@@H](CC[C@H]1c2cccnc2)c3ccccc3
+CN1[C@@H](CC[C@@H]1c2ccccc2)c3cccnc3
+Clc1ccc(OC[C@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1
+Clc1ccc(OC[C@@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1
+CN1[C@@H](CCC1=O)c2cccnc2
+CCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12
+CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CCCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12
+CCCCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C
+CCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C
+CN1CCC[C@H]1c2ccc[n+](CCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCc3ccc(CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc3)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCc3ccccc3CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3ccccc3C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCc3cc(CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc(CCCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c3)c2
+CN1CCC[C@H]1c2ccc[n+](CCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2
+CCCCCCCCCC[n+]1cccc2c1CC[C@]3(C)CCN(C)[C@]23C
+CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cccc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2
+CCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12
+CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cc(cc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)C#CCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c2
+CCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12
+CCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12
+C[N@+]1(CC[N@+]2(C)CCC[C@@H]2c3cccnc3)CCC[C@H]1c4cccnc4
+CN1[C@@H](C[C@@H](OC2O[C@@H]([C@@H](O)[C@H](O)[C@H]2O)C(=O)O)C1=O)c3cccnc3
+CN1C(=O)CC[C@@]1(O)c2cccnc2
+OCN1[C@@H](CCC1=O)c2cccnc2
+CN1CCC[C@H]1c2ccc[n+](c2)[C@@H]3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)O
+C[N+]1([O-])CCC[C@H]1c2cccnc2
+CN1CCC[C@@]1(O)c2cccnc2
+CN1[C@@H](CCC1=O)c2ccc[n+](C)c2
+CN1[C@@H](CCC1=O)c2ccc[n+]([O-])c2
+CN1[C@@H](CCC1=O)c2ccc[n+](c2)C3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)C
+Cc1cncc(c1)[C@@H]2CCC[N+]2(C)[O-]
+COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O
+COc1ccc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O
+COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6ccc(cc6C)C(=O)O
+COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccsc4
+COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccc(C)s4
+O=S(=O)(Nc1ncns1)c2ccc3c(cccc3c2)N4CCC[C@H]4c5cccnc5
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out4.smi	Mon Aug 05 05:21:58 2019 -0400
@@ -0,0 +1,4 @@
+C1CCCCC1
+C1CCCCCCCCCCC1
+C1CCCCCCC1
+C1CCCC1
\ No newline at end of file