view nxn_clustering.py @ 12:3b14765c22ee draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author bgruening
date Sat, 25 Sep 2021 19:07:44 +0000
parents 198b1e30c739
children
line wrap: on
line source

# !/usr/bin/env python
"""
    Modified version of code examples from the chemfp project.
    http://code.google.com/p/chem-fingerprints/
    Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""

import argparse

import chemfp
import matplotlib
matplotlib.use("Agg")  # noqa
from matplotlib import rcParams  # noqa
rcParams.update({"figure.autolayout": True})  # noqa
import numpy  # noqa
import pylab  # noqa
import scipy.cluster.hierarchy as hcluster  # noqa


def distance_matrix(arena, tanimoto_threshold=0.0):
    n = len(arena)
    # Start off a similarity matrix with 1.0s along the diagonal
    try:
        similarities = numpy.identity(n, "d")
    except Exception:
        raise Exception("Input dataset is to large!")
    chemfp.set_num_threads(args.processors)

    # Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold
    )

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.org
"""
    )

    parser.add_argument(
        "-i",
        "--input",
        dest="input_path",
        required=True,
        help="Path to the input file.",
    )

    parser.add_argument(
        "-c",
        "--cluster",
        dest="cluster_image",
        help="Path to the output cluster image.",
    )

    parser.add_argument(
        "-s",
        "--smatrix",
        dest="similarity_matrix",
        help="Path to the similarity matrix output file.",
    )

    parser.add_argument(
        "-t",
        "--threshold",
        dest="tanimoto_threshold",
        type=float,
        default=0.0,
        help="Tanimoto threshold [0.0]",
    )

    parser.add_argument("--oformat", default="png", help="Output format (png, svg)")

    parser.add_argument("-p", "--processors", type=int, default=4)

    args = parser.parse_args()

    targets = chemfp.open(args.input_path, format="fps")
    arena = chemfp.load_fingerprints(targets)
    distances = distance_matrix(arena, args.tanimoto_threshold)

    if args.similarity_matrix:
        numpy.savetxt(args.similarity_matrix, distances)

    if args.cluster_image:
        linkage = hcluster.linkage(distances, method="single", metric="euclidean")
        hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0)
        pylab.savefig(args.cluster_image, format=args.oformat)