view sucos.py @ 6:b8725fec8c7b draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author bgruening
date Wed, 14 Apr 2021 09:30:48 +0000
parents 58d18838e244
children
line wrap: on
line source

#!/usr/bin/env python
"""
Basic SuCOS scoring. Allows a set of molecules from a SD file to be overlayed to a reference molecule,
with the resulting scores being written as properties in the output SD file.

SuCOS is the work of Susan Leung.
GitHub: https://github.com/susanhleung/SuCOS
Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
"""

from __future__ import print_function

import argparse
import os

import numpy as np
import utils
from rdkit import Chem, RDConfig
from rdkit.Chem import AllChem, rdShapeHelpers
from rdkit.Chem.FeatMaps import FeatMaps

# start function definitions #########################################

# Setting up the features to use in FeatureMap
fdef = AllChem.BuildFeatureFactory(
    os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef")
)

fmParams = {}
for k in fdef.GetFeatureFamilies():
    fparams = FeatMaps.FeatMapParams()
    fmParams[k] = fparams

keep = (
    "Donor",
    "Acceptor",
    "NegIonizable",
    "PosIonizable",
    "ZnBinder",
    "Aromatic",
    "Hydrophobe",
    "LumpedHydrophobe",
)


def filterFeature(f):
    result = f.GetFamily() in keep
    # TODO - nothing ever seems to be filtered. Is this expected?
    if not result:
        utils.log("Filtered out feature type", f.GetFamily())
    return result


def getRawFeatures(mol):

    rawFeats = fdef.GetFeaturesForMol(mol)
    # filter that list down to only include the ones we're interested in
    filtered = list(filter(filterFeature, rawFeats))
    return filtered


def get_FeatureMapScore(
    small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All
):
    """
    Generate the feature map score.

    :param small_feats:
    :param large_feats:
    :param tani:
    :return:
    """

    featLists = []
    for rawFeats in [small_feats, large_feats]:
        # filter that list down to only include the ones we're interested in
        featLists.append(rawFeats)
    fms = [
        FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams)
        for x in featLists
    ]
    # set the score mode
    fms[0].scoreMode = score_mode

    try:
        if tani:
            c = fms[0].ScoreFeats(featLists[1])
            A = fms[0].GetNumFeatures()
            B = len(featLists[1])
            if B != fms[1].GetNumFeatures():
                utils.log("Why isn't B equal to number of features...?!")
            tani_score = float(c) / (A + B - c)
            return tani_score
        else:
            fm_score = fms[0].ScoreFeats(featLists[1]) / min(
                fms[0].GetNumFeatures(), len(featLists[1])
            )
            return fm_score
    except ZeroDivisionError:
        utils.log("ZeroDivisionError")
        return 0

    if tani:
        tani_score = float(c) / (A + B - c)
        return tani_score
    else:
        fm_score = fms[0].ScoreFeats(featLists[1]) / min(
            fms[0].GetNumFeatures(), len(featLists[1])
        )
        return fm_score


def get_SucosScore(
    ref_mol,
    query_mol,
    tani=False,
    ref_features=None,
    query_features=None,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):
    """
    This is the key function that calculates the SuCOS scores and is expected to be called from other modules.
    To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having
    to recalculate them. Use the getRawFeatures function to pre-calculate the features.

    :param ref_mol: The reference molecule to compare to
    :param query_mol: The molecule to align to the reference
    :param tani: Whether to calculate Tanimoto distances
    :param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it.
    :param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it.
    :return: A tuple of 3 values. 1 the sucos score, 2 the feature map score,
        3 the Tanimoto distance or 1 minus the protrude distance
    """

    if not ref_features:
        ref_features = getRawFeatures(ref_mol)
    if not query_features:
        query_features = getRawFeatures(query_mol)

    fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode)
    fm_score = np.float(np.clip(fm_score, 0, 1))

    try:
        if tani:
            tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol))
            tani_sim = np.clip(tani_sim, 0, 1)
            SuCOS_score = 0.5 * fm_score + 0.5 * tani_sim
            return SuCOS_score, fm_score, tani_sim
        else:
            protrude_dist = rdShapeHelpers.ShapeProtrudeDist(
                ref_mol, query_mol, allowReordering=False
            )
            protrude_dist = np.clip(protrude_dist, 0, 1)
            protrude_val = 1.0 - protrude_dist
            SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val
            return SuCOS_score, fm_score, protrude_val
    except Exception:
        utils.log("Failed to calculate SuCOS scores. Returning 0,0,0")
        return 0, 0, 0


def process(
    refmol_filename,
    inputs_filename,
    outputs_filename,
    refmol_index=None,
    refmol_format=None,
    tani=False,
    score_mode=FeatMaps.FeatMapScoreMode.All,
):

    ref_mol = utils.read_single_molecule(
        refmol_filename, index=refmol_index, format=refmol_format
    )
    # utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms")
    ref_features = getRawFeatures(ref_mol)

    input_file = utils.open_file_for_reading(inputs_filename)
    suppl = Chem.ForwardSDMolSupplier(input_file)
    output_file = utils.open_file_for_writing(outputs_filename)
    writer = Chem.SDWriter(output_file)

    count = 0
    total = 0
    errors = 0
    for mol in suppl:
        count += 1
        if mol is None:
            continue
        # utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms")
        try:
            sucos_score, fm_score, val3 = get_SucosScore(
                ref_mol,
                mol,
                tani=tani,
                ref_features=ref_features,
                score_mode=score_mode,
            )
            mol.SetDoubleProp("SuCOS_Score", sucos_score)
            mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
            if tani:
                mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
            else:
                mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
            utils.log("Scores:", sucos_score, fm_score, val3)
            writer.write(mol)
            total += 1
        except ValueError as e:
            errors += 1
            utils.log("Molecule", count, "failed to score:", e.message)

    input_file.close()
    writer.flush()
    writer.close()
    output_file.close()

    utils.log(
        "Completed.", total, "processed, ", count, "succeeded, ", errors, "errors"
    )


def parse_score_mode(value):
    if value is None or value == "all":
        return FeatMaps.FeatMapScoreMode.All
    elif value == "closest":
        return FeatMaps.FeatMapScoreMode.Closest
    elif value == "best":
        return FeatMaps.FeatMapScoreMode.Best
    else:
        raise ValueError(value + " is not a valid scoring mode option")


# start main execution #########################################


def main():

    parser = argparse.ArgumentParser(description="SuCOS with RDKit")
    parser.add_argument(
        "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)."
    )
    parser.add_argument(
        "-r",
        "--refmol",
        help="Molecule to compare against in Molfile (.mol) or SDF (.sdf) format",
    )
    parser.add_argument(
        "--refmol-format",
        help="Format for the reference molecule (mol or sdf). "
        + "Only needed if files don't have the expected extensions",
    )
    parser.add_argument(
        "--refmolidx",
        help="Reference molecule index in SD file if not the first",
        type=int,
        default=1,
    )
    parser.add_argument(
        "-o", "--output", help="Output file in SDF format. Can be gzipped (*.gz)."
    )
    parser.add_argument(
        "--tanimoto", action="store_true", help="Include Tanimoto distance in score"
    )
    parser.add_argument(
        "--score_mode",
        choices=["all", "closest", "best"],
        help="choose the scoring mode for the feature map, default is 'all'.",
    )

    args = parser.parse_args()
    utils.log("SuCOS Args: ", args)

    score_mode = parse_score_mode(args.score_mode)

    process(
        args.refmol,
        args.input,
        args.output,
        refmol_index=args.refmolidx,
        refmol_format=args.refmol_format,
        tani=args.tanimoto,
        score_mode=score_mode,
    )


if __name__ == "__main__":
    main()