# HG changeset patch
# User bgruening
# Date 1564996918 14400
# Node ID 915e9be3899426491f632c544eb9705d860f0b0e
# planemo upload for repository https://github.com/chembl/chembl_webresource_client commit 2e3c3c2bd7ecdc9c2968a32f91e81136e0cb3835
#
# diff -r 000000000000 -r 915e9be38994 chembl.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/chembl.py Mon Aug 05 05:21:58 2019 -0400
# @@ -0,0 +1,102 @@
#
# NOTE(review): this text was an exported changeset whose line breaks had been
# collapsed. The chembl.py hunk is restored below as runnable Python (diff '+'
# markers dropped); the remaining patch text is preserved as comments.
"""Command-line search of the ChEMBL database for compounds.

Runs either a similarity search or a substructure search for a SMILES
string, optionally narrows the result with filters, and writes the
canonical SMILES of the hits to a file, one per line.
"""
import argparse


def _new_client():
    """Import and return the ``new_client`` object of the ChEMBL client.

    The import is deferred to first use so that ``--help`` and argument
    validation work without the third-party package being importable.
    """
    from chembl_webresource_client.new_client import new_client
    return new_client


def open_file(filename):
    """Return the first line of ``filename`` without surrounding whitespace.

    Only the first line is read; the trailing newline is stripped so it does
    not become part of the SMILES query or count towards its length.
    """
    with open(filename) as f:
        return f.readline().strip()


def get_smiles(res):
    """Collect the distinct canonical SMILES found in search results.

    ``res`` is an iterable of records indexed as
    ``record['molecule_structures']['canonical_smiles']``. Records whose
    structure entry is missing or ``None``, or whose SMILES is empty, are
    skipped instead of raising.

    Returns a ``set`` of SMILES strings.
    """
    smiles = set()
    for mol in res:
        try:
            smi = mol['molecule_structures']['canonical_smiles']
        except (KeyError, TypeError):
            # No structure block on this record (e.g. it is None).
            continue
        if smi:
            smiles.add(smi)
    return smiles


def sim_search(smiles, tanimoto):
    """Return compounds which are within a Tanimoto range of the SMILES input.

    ``tanimoto`` is passed to the client as the ``similarity`` argument.
    Only the ``molecule_structures`` field is requested.
    """
    similarity = _new_client().similarity
    return similarity.filter(smiles=smiles, similarity=tanimoto).only(['molecule_structures'])


def substr_search(smiles):
    """Return compounds which contain the SMILES substructure input.

    Only the ``molecule_structures`` field is requested.
    """
    substructure = _new_client().substructure
    return substructure.filter(smiles=smiles).only(['molecule_structures'])


def filter_drugs(mols):
    """Return only compounds which are approved drugs (``max_phase=4``)."""
    return mols.filter(max_phase=4)


def filter_biotherapeutic(mols):
    """Return only biotherapeutic molecules (``biotherapeutic`` is not null)."""
    return mols.filter(biotherapeutic__isnull=False)


def filter_nat_prod(mols):
    """Return only natural products (``natural_product=1``)."""
    return mols.filter(natural_product=1)


def filter_ro5(mols):
    """Return only compounds with no RO5 violations."""
    return mols.filter(molecule_properties__num_ro5_violations=0)


def main(argv=None):
    """Parse the command line, run the search and write the SMILES output.

    ``argv`` is the argument list to parse; ``None`` (the default) makes
    argparse read ``sys.argv[1:]``.

    Exits through ``parser.error`` (status 2) when no SMILES input is given,
    when ``-o`` is missing, or when a similarity search is requested without
    ``-t``. Raises ``IOError`` when the SMILES is shorter than 5 characters.
    """
    parser = argparse.ArgumentParser(description='Search ChEMBL database for compounds')
    parser.add_argument('-i', '--input', help='SMILES input')
    parser.add_argument('-f', '--file', help='SMILES input as file')
    # The output path is always opened at the end, so demand it up front
    # rather than failing after the search has already run.
    parser.add_argument('-o', '--output', required=True, help="SMILES output")
    parser.add_argument('-t', '--tanimoto', type=int, help='Tanimoto similarity score')
    parser.add_argument('-s', '--substructure', action='store_true', help='Substructure search using the SMILES input.')
    parser.add_argument('-d', '--drugs', action='store_true', help='Filter approved drugs')
    parser.add_argument('-b', '--biotherapeutic', action='store_true', help='Filter biotherapeutic molecules')
    parser.add_argument('-n', '--nat-prod', action='store_true', help='Filter natural products')
    parser.add_argument('-r', '--ro5', action='store_true', help='Filter compounds that pass Lipinski RO5')

    args = parser.parse_args(argv)

    if args.file:  # get SMILES from file rather than -i option
        args.input = open_file(args.file)

    if args.input is None:
        parser.error('a SMILES input is required: use -i/--input or -f/--file')

    if len(args.input) < 5:
        raise IOError('SMILES must be at least 5 characters long.')

    if not args.substructure and args.tanimoto is None:
        parser.error('-t/--tanimoto is required for a similarity search')

    if args.substructure:  # specify search type: substructure or similarity
        mols = substr_search(args.input)
    else:
        mols = sim_search(args.input, args.tanimoto)

    # filter options:
    if args.drugs:
        mols = filter_drugs(mols)

    if args.biotherapeutic:
        mols = filter_biotherapeutic(mols)

    if args.nat_prod:
        mols = filter_nat_prod(mols)

    if args.ro5:
        mols = filter_ro5(mols)

    # get SMILES from search output
    mols = get_smiles(mols)

    # write to file; sorted because set iteration order is arbitrary, and
    # no trailing newline is written after the last SMILES
    with open(args.output, 'w') as f:
        f.write('\n'.join(sorted(mols)))


if __name__ == "__main__":
    main()

# diff -r 000000000000 -r 915e9be38994 chembl.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/chembl.xml Mon Aug 05 05:21:58 2019 -0400
# @@ -0,0 +1,118 @@
# NOTE(review): the XML markup of this hunk is missing from the text; only
# these fragments remain:
#   for compounds which are similar to a SMILES string
#   chembl_webresource_client
#   10.1093/nar/gkv352
#   arXiv:1607.00378v1
#
# diff -r 000000000000 -r 915e9be38994 test-data/in1.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/in1.smi Mon
Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,1 @@ +CN1CCC[C@H]1c2cccnc2 diff -r 000000000000 -r 915e9be38994 test-data/out1.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out1.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,47 @@ +CN1CCCC1c2cccnc2 +CN1CCC[C@@H]1c2cccnc2 +CN1CCC[C@H]1c2cccnc2 +CCN1CCCC1c2cccnc2 +CN1CCCC1c2ccc(C)nc2 +CCc1ccc(cn1)C2CCCN2C +CN1CCCC1c2cncc(C)c2 +CCCc1ccc(cn1)C2CCCN2C +CCc1cncc(c1)C2CCCN2C +CN1CCCC[C@H]1c2cccnc2 +CN1CCCCC1c2cccnc2 +CCCc1cncc(c1)C2CCCN2C +CN1CCCC1c2cccnc2C +CCCCc1ccc(cn1)C2CCCN2C +CCCCCc1ccc(cn1)C2CCCN2C +CC1CCN(C)[C@@H]1c2cccnc2 +CN1CCCC1c2ccc(CCCc3ccccc3)nc2 +CN1CCCC1c2cncc(Cl)c2 +CN1CCCC1c2ccc(CCc3ccccc3)nc2 +CN1CCC[C@H]1c2ccccc2 +CN1CCCC1c2ccccc2 +CN1CCC[C@H]1c2ccccn2 +CN1CCCC1c2cncc(F)c2 +COc1cncc(c1)C2CCCN2C +CN1CCCC1c2cncc(Br)c2 +CN1CCCC1c2ccc(nc2)c3ccccc3 +CN1CCCC1c2ccc(\C=C\c3ccccc3)nc2 +COc1ccncc1C2CCCN2C +CCCC[C@H]1CC[C@H](N1C)c2cccnc2 +CCCC[C@@H]1CC[C@H](N1C)c2cccnc2 +CN1CCCC1c2cnccc2N +C[C@H]1C[C@H](N(C)C1)c2cccnc2 +CC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1CCCC1c2ccc(CCc3ccc(Cl)cc3)nc2 +CN1CCCC1c2ccc(Cl)nc2 +CN1CCCC1c2ccc(C)cc2 +COCC1CCN(C)[C@@H]1c2cccnc2 +C(N1CCCC1c2cccnc2)c3ccccc3 +C[C@H]1CC[C@H](N1C)c2cccnc2 +C[C@@H]1CC[C@H](N1C)c2cccnc2 +CN1CCCC1c2ccc(Cl)cc2 +CN1CCCC1c2ccc(F)nc2 +CN1CCC(CF)[C@H]1c2cccnc2 +CN1CCCC1c2ccc(Br)nc2 +COc1ccc(CCc2ccc(cn2)C3CCCN3C)cc1 +CN1CCC(CO)[C@H]1c2cccnc2 +CN1CCCC1c2cnc3ccccc3c2 \ No newline at end of file diff -r 000000000000 -r 915e9be38994 test-data/out2.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out2.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,1 @@ +CN1CCC[C@H]1c2cccnc2 \ No newline at end of file diff -r 000000000000 -r 915e9be38994 test-data/out3.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out3.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,72 @@ +CN1CCC[C@H]1c2cccnc2 +CN1CCC[C@H]1c2ccc[n+]([BH2-]C#N)c2 +CN1CC[C@H]2CCc3ncccc3[C@@H]12 +CN1CC[C@H]2CCc3c(ccc[n+]3[BH2-]C#N)[C@@H]12 +CN1[C@@H](CC[C@H]1c2cccnc2)C#N 
+CN1[C@H](CC[C@H]1c2cccnc2)C#N +CN1CCC[C@H]1c2cncc(c2)C#C +CN1C[C@@H](Cc2ccccc2)C[C@H]1c3cccnc3 +C[C@@H]1CC[C@H](N1C)c2cccnc2 +C[C@H]1CC[C@H](N1C)c2cccnc2 +CC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@@H](O)C[C@H]1c2cccnc2 +CN1CCC(CO)[C@H]1c2cccnc2 +CSC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@H](CO)C[C@H]1c2cccnc2 +CN1C[C@@H](CC#N)C[C@H]1c2cccnc2 +CN1C[C@@H](CF)C[C@H]1c2cccnc2 +CO[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1CCC(CF)[C@H]1c2cccnc2 +CC1CCN(C)[C@@H]1c2cccnc2 +COCC1CCN(C)[C@@H]1c2cccnc2 +CN1C[C@@H](CO)C[C@H]1c2cccnc2 +CN1C[C@H](C[C@H]1c2cccnc2)OC(=O)C +CN1C[C@@H](C[C@H]1c2cccnc2)C#N +CC1CN(C)[C@@H](C1C)c2cccnc2 +C[C@H]1C[C@H](N(C)C1)c2cccnc2 +CN1C[C@H](C[C@H]1c2cccnc2)OS(=O)(=O)C +COC[C@H]1C[C@H](N(C)C1)c2cccnc2 +CCCC[C@@H]1CC[C@H](N1C)c2cccnc2 +CCCC[C@H]1CC[C@H](N1C)c2cccnc2 +CN1[C@@H](CC[C@H]1c2cccnc2)c3ccccc3 +CN1[C@@H](CC[C@@H]1c2ccccc2)c3cccnc3 +Clc1ccc(OC[C@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1 +Clc1ccc(OC[C@@H]2CN3C(=O)CC[C@@]3(O2)c4cccnc4)cc1 +CN1[C@@H](CCC1=O)c2cccnc2 +CCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CCCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CCCCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C +CCCCCCCCCC[n+]1cccc(c1)[C@@H]2CCCN2C +CN1CCC[C@H]1c2ccc[n+](CCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCc3ccc(CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc3)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCc3ccccc3CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3ccccc3C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCc3cc(CCCCC[n+]4cccc(c4)[C@@H]5CCCN5C)cc(CCCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c3)c2 +CN1CCC[C@H]1c2ccc[n+](CCCCCCCCC[n+]3cccc(c3)[C@@H]4CCCN4C)c2 +CCCCCCCCCC[n+]1cccc2c1CC[C@]3(C)CCN(C)[C@]23C +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cccc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)c2 
+CCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CN1CCC[C@H]1c2ccc[n+](CCCC#Cc3cc(cc(c3)C#CCCC[n+]4cccc(c4)[C@@H]5CCCN5C)C#CCCC[n+]6cccc(c6)[C@@H]7CCCN7C)c2 +CCCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +CCCCCCCCCC[n+]1cccc2[C@@H]3[C@@H](CCN3C)CCc12 +C[N@+]1(CC[N@+]2(C)CCC[C@@H]2c3cccnc3)CCC[C@H]1c4cccnc4 +CN1[C@@H](C[C@@H](OC2O[C@@H]([C@@H](O)[C@H](O)[C@H]2O)C(=O)O)C1=O)c3cccnc3 +CN1C(=O)CC[C@@]1(O)c2cccnc2 +OCN1[C@@H](CCC1=O)c2cccnc2 +CN1CCC[C@H]1c2ccc[n+](c2)[C@@H]3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)O +C[N+]1([O-])CCC[C@H]1c2cccnc2 +CN1CCC[C@@]1(O)c2cccnc2 +CN1[C@@H](CCC1=O)c2ccc[n+](C)c2 +CN1[C@@H](CCC1=O)c2ccc[n+]([O-])c2 +CN1[C@@H](CCC1=O)c2ccc[n+](c2)C3O[C@@H]([C@@H](O)[C@H](O)[C@H]3O)C(=O)C +Cc1cncc(c1)[C@@H]2CCC[N+]2(C)[O-] +COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O +COc1ccc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6c(C)cc(cc6C)C(=O)O +COc1ncc(cc1c2ncc(cc2[C@@H]3CC[C@H]4[C@H](OC(=O)N34)c5cc(cc(c5)C(F)(F)F)C(F)(F)F)C(F)(F)F)c6ccc(cc6C)C(=O)O +COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccsc4 +COCCOc1ncccc1[C@@H]2C(C(=O)C(C)C)C(=O)C(=O)N2c3ccc(cc3)c4ccc(C)s4 +O=S(=O)(Nc1ncns1)c2ccc3c(cccc3c2)N4CCC[C@H]4c5cccnc5 \ No newline at end of file diff -r 000000000000 -r 915e9be38994 test-data/out4.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out4.smi Mon Aug 05 05:21:58 2019 -0400 @@ -0,0 +1,4 @@ +C1CCCCC1 +C1CCCCCCCCCCC1 +C1CCCCCCC1 +C1CCCC1 \ No newline at end of file