# HG changeset patch # User bgruening # Date 1584292718 14400 # Node ID 0e330829de40af3f91486a079acf89bd01de2857 # Parent a2369e86bc480d273572cbb81c33cda88000f50a "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sygma commit 5b2d7437ba0875c0913630fd2165c82ed933422c" diff -r a2369e86bc48 -r 0e330829de40 sygma.xml --- a/sygma.xml Mon Sep 30 17:38:26 2019 -0400 +++ b/sygma.xml Sun Mar 15 13:18:38 2020 -0400 @@ -1,27 +1,31 @@ - + + by performing common reactions on one or more parent molecule(s) 1.1.1 - by performing common reactions on one or more parent molecule(s) sygma rdkit + - + @@ -31,6 +35,13 @@ + + + + + + + @@ -44,8 +55,8 @@ **What this tool does** -SyGMa (Systematic Generation of potential Metabolites) is a tool to generate -possible metabolic products of an input parent structure. The tool provides +SyGMa (Systematic Generation of potential Metabolites) is a tool to generate +possible metabolic products of an input parent structure. The tool provides two rulesets to cover both phase 1 and 2 metabolism. ----- @@ -54,11 +65,11 @@ **Input** -A file in SMILES or SDF format. Files may contain multiple molecule -entries; in this case outputs are distinguished by the code included in the +A file in SMILES or SDF format. Files may contain multiple molecule +entries; in this case outputs are distinguished by the code included in the output file (e.g. SYGMA0MOL0 vs SYGMA1MOL0). -The number of reaction cycles to be performed for both phase 1 and phase 2 +The number of reaction cycles to be performed for both phase 1 and phase 2 metabolism should also be specified. ----- @@ -67,25 +78,32 @@ **Output** -For each molecule in the input file, a SMILES file is produced containing -SMILES strings of the metabolite outputs, a generated ID code, and an empirical -probability score (corresponding to an estimated probability that a product is +A tab-separated values (tsv) file for each molecule in the input file. 
+Columns contain a generated ID code (compound_id), SMILES strings of the +metabolite outputs (smiles) and an empirical probability score (sygma_score). +The calculated score corresponds to an estimated probability (that a product is actually metabolically produced in humans). The first line is always the parent -molecule itself:: +molecule itself. - Oc1ccccc1 SYGMA0MOL0 1.0 - O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O SYGMA0MOL1 0.25 - O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2 0.119 - Oc1ccc(O)cc1 SYGMA0MOL3 0.056 - COc1ccccc1 SYGMA0MOL4 0.054 - Oc1ccccc1O SYGMA0MOL5 0.032 - O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6 0.014 - O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O SYGMA0MOL7 0.008 - O=S(=O)(O)Oc1ccc(O)cc1 SYGMA0MOL8 0.00666 - O=S(=O)(O)Oc1ccccc1O SYGMA0MOL9 0.00381 - COc1ccc(O)cc1 SYGMA0MOL10 0.00302 - COc1ccccc1O SYGMA0MOL11 0.00173 ++----------------------------------+-------------+-------------+ +| smiles | compound_id | sygma_score | ++----------------------------------+-------------+-------------+ +| Oc1ccccc1 | SYGMA0MOL0 | 1.0 | ++----------------------------------+-------------+-------------+ +| O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O | SYGMA0MOL1 | 0.25 | ++----------------------------------+-------------+-------------+ +If the option for more detailed output is selected, additional columns include +molecular formula (molecular_formula) of the chemical structure, number of +reactions (sygma_n) and transformation pathway (sygma_pathway) involved. 
+ ++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+ +| smiles | compound_id | sygma_score | molecular_formula | sygma_n | sygma_pathway | ++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+ +| Oc1ccccc1 | SYGMA0MOL0 | 1.0 | C6H6O | 1 | parent | ++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+ +| O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O | SYGMA0MOL1 | 0.25 | C12H14O7 | 2 | O-glucuronidation_(aromatic_hydroxyl) | ++----------------------------------+-------------+--------------+-------------------+----------+---------------------------------------+ ]]> diff -r a2369e86bc48 -r 0e330829de40 sygma_metabolites.py --- a/sygma_metabolites.py Mon Sep 30 17:38:26 2019 -0400 +++ b/sygma_metabolites.py Sun Mar 15 13:18:38 2020 -0400 @@ -15,8 +15,8 @@ """ if ext == 'sdf': return [n for n in SDMolSupplier(filename)] - with open(filename) as f: - mols = f.read().split('\n') + with open(filename) as f: + mols = f.read().split('\n') if ext == 'smi' or ext == 'inchi': return [Chem.MolFromSmiles(mol, sanitize=True) for mol in mols if mol != ''] @@ -29,30 +29,53 @@ [sygma.ruleset['phase2'], int(phase2_cycles)]]) metabolic_tree = scenario.run(parent) metabolic_tree.calc_scores() - return metabolic_tree.to_smiles() + return metabolic_tree.to_list() def main(): parser = argparse.ArgumentParser() - parser.add_argument('-i', '--infile', required=True, help='Path to the input file.') - parser.add_argument('-o', '--outfile', required=True, help='Path to the output file.') - parser.add_argument("--iformat", help="Specify the input file format.") - parser.add_argument("--phase1", help="Number of phase1 cycles.") - parser.add_argument("--phase2", help="Number of phase2 cycles.") + parser.add_argument("-i", "--infile", required=True, help="Path 
to the input file.") + parser.add_argument("-o", "--outfile", required=True, help="Path to the output file.") + parser.add_argument("--iformat", required=True, help="Specify the input file format.") + parser.add_argument("--phase1", required=True, help="Number of phase1 cycles.") + parser.add_argument("--phase2", required=True, help="Number of phase2 cycles.") + parser.add_argument("--detailed", dest="detailed", + action="store_true", help="Returns more detailed output") args = parser.parse_args() mols = mol_supplier(args.infile, args.iformat) - outp = np.zeros((0,3)) + if args.detailed: + outp = np.zeros((0,6)) + else: + outp = np.zeros((0,3)) for n in range(len(mols)): - metabs = np.array(predict_metabolites(mols[n], args.phase1, args.phase2)) - metabs = np.column_stack(( - metabs[:,0], # SMILES - ['SYGMA{}MOL{}'.format(n, m) for m in range(metabs.shape[0])], # SMILES label - np.round(np.array(metabs[:,1], dtype=float), decimals=5) # score rounded to 5 dp - )) - outp = np.vstack((outp, metabs)) - np.savetxt(args.outfile, outp, fmt="%s") - + metabs = predict_metabolites(mols[n], args.phase1, args.phase2) + for entry in range(len(metabs)): + smiles = Chem.MolToSmiles(metabs[entry]['SyGMa_metabolite']) + if args.detailed: + out = np.column_stack(( + smiles, # SMILES + 'SYGMA{}MOL{}'.format(n, entry), # SMILES label + np.round(np.array(metabs[entry]['SyGMa_score'], dtype=float), + decimals=5), # score rounded to 5 dp + Chem.rdMolDescriptors.CalcMolFormula(Chem.MolFromSmiles(smiles)), # Molecular formula + len(metabs[entry]["SyGMa_pathway"].split("\n")), # SyGMa_n Sygma pathway length + metabs[entry]["SyGMa_pathway"].replace("\n", "") # SyGMa pathway + )) + else: + out = np.column_stack(( + smiles, # SMILES + 'SYGMA{}MOL{}'.format(n, entry), # SMILES label + np.round(np.array(metabs[entry]['SyGMa_score'], dtype=float), + decimals=5) # score rounded to 5 dp + )) + outp = np.vstack((outp, out)) + if args.detailed: + np.savetxt(args.outfile, outp, fmt="%s", 
delimiter="\t", + header="smiles\tcompound_id\tsygma_score\tmolecular_formula\tsygma_n\tsygma_pathway", comments="") + else: + np.savetxt(args.outfile, outp, fmt="%s", delimiter="\t", + header="smiles\tcompound_id\tsygma_score", comments="") if __name__ == "__main__": main() diff -r a2369e86bc48 -r 0e330829de40 test-data/o.smi --- a/test-data/o.smi Mon Sep 30 17:38:26 2019 -0400 +++ b/test-data/o.smi Sun Mar 15 13:18:38 2020 -0400 @@ -1,16 +1,17 @@ -Oc1ccccc1 SYGMA0MOL0 1.0 -O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O SYGMA0MOL1 0.25 -O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2 0.119 -Oc1ccc(O)cc1 SYGMA0MOL3 0.056 -COc1ccccc1 SYGMA0MOL4 0.054 -Oc1ccccc1O SYGMA0MOL5 0.032 -O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6 0.014 -O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O SYGMA0MOL7 0.008 -O=S(=O)(O)Oc1ccc(O)cc1 SYGMA0MOL8 0.00666 -O=S(=O)(O)Oc1ccccc1O SYGMA0MOL9 0.00381 -COc1ccc(O)cc1 SYGMA0MOL10 0.00302 -COc1ccccc1O SYGMA0MOL11 0.00173 -CCOCC SYGMA1MOL0 1.0 -CCO SYGMA1MOL1 0.087 -CCOC1OC(C(=O)O)C(O)C(O)C1O SYGMA1MOL2 0.00879 -CCOS(=O)(=O)O SYGMA1MOL3 0.00157 +smiles compound_id sygma_score +Oc1ccccc1 SYGMA0MOL0 1.0 +O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O SYGMA0MOL1 0.25 +O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2 0.119 +Oc1ccc(O)cc1 SYGMA0MOL3 0.056 +COc1ccccc1 SYGMA0MOL4 0.054 +Oc1ccccc1O SYGMA0MOL5 0.032 +O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6 0.014 +O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O SYGMA0MOL7 0.008 +O=S(=O)(O)Oc1ccc(O)cc1 SYGMA0MOL8 0.00666 +O=S(=O)(O)Oc1ccccc1O SYGMA0MOL9 0.00381 +COc1ccc(O)cc1 SYGMA0MOL10 0.00302 +COc1ccccc1O SYGMA0MOL11 0.00173 +CCOCC SYGMA1MOL0 1.0 +CCO SYGMA1MOL1 0.087 +CCOC1OC(C(=O)O)C(O)C(O)C1O SYGMA1MOL2 0.00879 +CCOS(=O)(=O)O SYGMA1MOL3 0.00157 diff -r a2369e86bc48 -r 0e330829de40 test-data/o2.smi --- a/test-data/o2.smi Mon Sep 30 17:38:26 2019 -0400 +++ b/test-data/o2.smi Sun Mar 15 13:18:38 2020 -0400 @@ -1,32 +1,33 @@ -CC(=O)Oc1ccccc1C(=O)O SYGMA0MOL0 1.0 -O=C(O)c1ccccc1O SYGMA0MOL1 0.529 -CC(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL2 0.061 -CC(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL3 0.056 
-O=C(CO)Oc1ccccc1C(=O)O SYGMA0MOL4 0.049 -O=C(O)c1ccc(O)cc1O SYGMA0MOL5 0.03227 -CC(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL6 0.032 -O=C(O)c1cc(O)ccc1O SYGMA0MOL7 0.02962 -CC(=O)Oc1ccccc1 SYGMA0MOL8 0.023 -O=C(O)c1cccc(O)c1O SYGMA0MOL9 0.01693 -O=C(O)C(=O)Oc1ccccc1C(=O)O SYGMA0MOL10 0.016 -O=C(O)CO SYGMA0MOL11 0.01333 -Oc1ccccc1 SYGMA0MOL12 0.01217 -O=C(O)C(=O)O SYGMA0MOL13 0.00435 -O=C(O)Oc1ccccc1C(=O)O SYGMA0MOL14 0.00355 -CC(=O)Oc1cc(O)c(O)cc1C(=O)O SYGMA0MOL15 0.00342 -O=C(CO)Oc1cc(O)ccc1C(=O)O SYGMA0MOL16 0.00299 -O=C(CO)Oc1ccc(O)cc1C(=O)O SYGMA0MOL17 0.00274 -CC(=O)Oc1c(C(=O)O)ccc(O)c1O SYGMA0MOL18 0.00195 -CC(=O)Oc1c(O)cc(O)cc1C(=O)O SYGMA0MOL19 0.00179 -CC(=O)Oc1c(O)ccc(O)c1C(=O)O SYGMA0MOL20 0.00179 -O=C(CO)Oc1c(O)cccc1C(=O)O SYGMA0MOL21 0.00157 -CC(=O)Oc1cccc(O)c1 SYGMA0MOL22 0.0014 -CC(=O)Oc1ccc(O)cc1 SYGMA0MOL23 0.00129 -O=C(CO)Oc1ccccc1 SYGMA0MOL24 0.00113 -O=C(O)C(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL25 0.00098 -O=C(O)C(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL26 0.0009 -CC(=O)Oc1ccccc1O SYGMA0MOL27 0.00074 -CC(=O)Oc1ccc(O)c(O)c1C(=O)O SYGMA0MOL28 0.00073 -O=C(O)C(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL29 0.00051 -O=COc1ccccc1C(=O)O SYGMA0MOL30 0.00037 -O=C(O)C(=O)Oc1ccccc1 SYGMA0MOL31 0.00037 +smiles compound_id sygma_score +CC(=O)Oc1ccccc1C(=O)O SYGMA0MOL0 1.0 +O=C(O)c1ccccc1O SYGMA0MOL1 0.529 +CC(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL2 0.061 +CC(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL3 0.056 +O=C(CO)Oc1ccccc1C(=O)O SYGMA0MOL4 0.049 +O=C(O)c1ccc(O)cc1O SYGMA0MOL5 0.03227 +CC(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL6 0.032 +O=C(O)c1cc(O)ccc1O SYGMA0MOL7 0.02962 +CC(=O)Oc1ccccc1 SYGMA0MOL8 0.023 +O=C(O)c1cccc(O)c1O SYGMA0MOL9 0.01693 +O=C(O)C(=O)Oc1ccccc1C(=O)O SYGMA0MOL10 0.016 +O=C(O)CO SYGMA0MOL11 0.01333 +Oc1ccccc1 SYGMA0MOL12 0.01217 +O=C(O)C(=O)O SYGMA0MOL13 0.00435 +O=C(O)Oc1ccccc1C(=O)O SYGMA0MOL14 0.00355 +CC(=O)Oc1cc(O)c(O)cc1C(=O)O SYGMA0MOL15 0.00342 +O=C(CO)Oc1cc(O)ccc1C(=O)O SYGMA0MOL16 0.00299 +O=C(CO)Oc1ccc(O)cc1C(=O)O SYGMA0MOL17 0.00274 +CC(=O)Oc1c(C(=O)O)ccc(O)c1O SYGMA0MOL18 0.00195 
+CC(=O)Oc1c(O)cc(O)cc1C(=O)O SYGMA0MOL19 0.00179 +CC(=O)Oc1c(O)ccc(O)c1C(=O)O SYGMA0MOL20 0.00179 +O=C(CO)Oc1c(O)cccc1C(=O)O SYGMA0MOL21 0.00157 +CC(=O)Oc1cccc(O)c1 SYGMA0MOL22 0.0014 +CC(=O)Oc1ccc(O)cc1 SYGMA0MOL23 0.00129 +O=C(CO)Oc1ccccc1 SYGMA0MOL24 0.00113 +O=C(O)C(=O)Oc1cc(O)ccc1C(=O)O SYGMA0MOL25 0.00098 +O=C(O)C(=O)Oc1ccc(O)cc1C(=O)O SYGMA0MOL26 0.0009 +CC(=O)Oc1ccccc1O SYGMA0MOL27 0.00074 +CC(=O)Oc1ccc(O)c(O)c1C(=O)O SYGMA0MOL28 0.00073 +O=C(O)C(=O)Oc1c(O)cccc1C(=O)O SYGMA0MOL29 0.00051 +O=COc1ccccc1C(=O)O SYGMA0MOL30 0.00037 +O=C(O)C(=O)Oc1ccccc1 SYGMA0MOL31 0.00037 diff -r a2369e86bc48 -r 0e330829de40 test-data/o_detailed.smi --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/o_detailed.smi Sun Mar 15 13:18:38 2020 -0400 @@ -0,0 +1,17 @@ +smiles compound_id sygma_score molecular_formula sygma_n sygma_pathway +Oc1ccccc1 SYGMA0MOL0 1.0 C6H6O 1 parent; +O=C(O)C1OC(Oc2ccccc2)C(O)C(O)C1O SYGMA0MOL1 0.25 C12H14O7 2 O-glucuronidation_(aromatic_hydroxyl); +O=S(=O)(O)Oc1ccccc1 SYGMA0MOL2 0.119 C6H6O4S 2 sulfation_(aromatic_hydroxyl); +Oc1ccc(O)cc1 SYGMA0MOL3 0.056 C6H6O2 2 aromatic_hydroxylation_(para_to_oxygen); +COc1ccccc1 SYGMA0MOL4 0.054 C7H8O 2 methylation_(aromatic_OH); +Oc1ccccc1O SYGMA0MOL5 0.032 C6H6O2 2 aromatic_hydroxylation_(ortho_to_oxygen); +O=C(O)C1OC(Oc2ccc(O)cc2)C(O)C(O)C1O SYGMA0MOL6 0.014 C12H14O8 3 aromatic_hydroxylation_(para_to_oxygen); O-glucuronidation_(aromatic_hydroxyl); +O=C(O)C1OC(Oc2ccccc2O)C(O)C(O)C1O SYGMA0MOL7 0.008 C12H14O8 3 aromatic_hydroxylation_(ortho_to_oxygen); O-glucuronidation_(aromatic_hydroxyl); +O=S(=O)(O)Oc1ccc(O)cc1 SYGMA0MOL8 0.00666 C6H6O5S 3 aromatic_hydroxylation_(para_to_oxygen); sulfation_(aromatic_hydroxyl); +O=S(=O)(O)Oc1ccccc1O SYGMA0MOL9 0.00381 C6H6O5S 3 aromatic_hydroxylation_(ortho_to_oxygen); sulfation_(aromatic_hydroxyl); +COc1ccc(O)cc1 SYGMA0MOL10 0.00302 C7H8O2 3 aromatic_hydroxylation_(para_to_oxygen); methylation_(aromatic_OH); +COc1ccccc1O SYGMA0MOL11 0.00173 C7H8O2 3 
aromatic_hydroxylation_(ortho_to_oxygen); methylation_(aromatic_OH); +CCOCC SYGMA1MOL0 1.0 C4H10O 1 parent; +CCO SYGMA1MOL1 0.087 C2H6O 2 O-dealkylation_(aliphatic); +CCOC1OC(C(=O)O)C(O)C(O)C1O SYGMA1MOL2 0.00879 C8H14O7 3 O-dealkylation_(aliphatic); O-glucuronidation_(aliphatic_hydroxyl); +CCOS(=O)(=O)O SYGMA1MOL3 0.00157 C2H6O4S 3 O-dealkylation_(aliphatic); sulfation_(aliphatic_hydroxyl);