view @ 6:e3a7d6cc87af draft

planemo upload for repository commit 5c14bdfd58559dbcd8ab722e0872a8fc072aeca7
author bgruening
date Fri, 23 Mar 2018 03:26:33 -0400
parents 70b071de9bee
children 0d88631bb7de
line wrap: on
line source

#!/usr/bin/env python
"""
    Modified version of code examples from the chemfp project.
    Thanks to Andrew Dalke of Andrew Dalke Scientific!
"""
import matplotlib
import argparse
import os
import chemfp
import scipy.cluster.hierarchy as hcluster
import pylab
import numpy

def distance_matrix(arena, tanimoto_threshold=0.0, processors=None):
    """Return the NxN Tanimoto distance matrix (1 - similarity) for ``arena``.

    Parameters:
        arena: fingerprint arena to compare against itself. It must support
            ``len()`` and be accepted by chemfp's symmetric threshold search
            (presumably the object returned by ``chemfp.load_fingerprints``).
        tanimoto_threshold: threshold handed to the chemfp search. Pairs the
            search does not report keep a similarity of 0.0, i.e. a distance
            of 1.0.
        processors: number of threads passed to ``chemfp.set_num_threads``.
            When ``None`` the value is read from the module-level
            ``args.processors`` created by the ``__main__`` block, which is
            how this function has always obtained it.

    Returns:
        A ``numpy`` array of doubles with shape ``(n, n)``, ``n = len(arena)``,
        holding ``1.0 - similarity``; the diagonal is therefore 0.0.

    Raises:
        Exception: if the ``n x n`` matrix cannot be allocated.
    """
    n = len(arena)

    # Start off a similarity matrix with 1.0s along the diagonal.
    # Only allocation failures are translated; anything else propagates.
    try:
        similarities = numpy.identity(n, "d")
    except (MemoryError, ValueError):
        raise Exception('Input dataset is to large!')

    if processors is None:
        # Backward compatible default: use the command line options parsed
        # by the script entry point.
        processors = args.processors
    chemfp.set_num_threads(processors)

    ## Compute the full similarity matrix.
    # The implementation computes the upper-triangle then copies
    # the upper-triangle into lower-triangle. It does not include
    # terms for the diagonal.
    results = chemfp.search.threshold_tanimoto_search_symmetric(
        arena, threshold=tanimoto_threshold)

    # Copy the results into the NumPy array.
    for row_index, row in enumerate(results.iter_indices_and_scores()):
        for target_index, target_score in row:
            similarities[row_index, target_index] = target_score

    # Return the distance matrix using the similarity matrix
    return 1.0 - similarities

if __name__ == "__main__":
    # Command line entry point: load an FPS file, build the NxN Tanimoto
    # distance matrix and optionally write the matrix and a dendrogram image.
    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
For more details please see the chemfp documentation:
https://chemfp.readthedocs.io
""")

    parser.add_argument("-i", "--input", dest="input_path",
                    help="Path to the input file.")

    parser.add_argument("-c", "--cluster", dest="cluster_image",
                    help="Path to the output cluster image.")

    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
                    help="Path to the similarity matrix output file.")

    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
                    type=float, default=0.0,
                    help="Tanimoto threshold [0.0]")

    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")

    # NOTE(review): the continuation of this call was lost; a default of 4
    # threads is assumed -- confirm against the tool wrapper.
    parser.add_argument('-p', '--processors', type=int,
                    default=4,
                    help="Number of threads used by chemfp [4]")

    # `args` must stay a module-level name: distance_matrix() reads
    # args.processors to configure the chemfp thread count.
    args = parser.parse_args()

    targets = chemfp.open( args.input_path, format='fps' )
    arena = chemfp.load_fingerprints( targets )
    distances  = distance_matrix( arena, args.tanimoto_threshold )

    if args.similarity_matrix:
        # ndarray.tofile() with no separator writes the raw array values.
        distances.tofile( args.similarity_matrix )

    if args.cluster_image:
        # NOTE(review): `distances` is a square 2-D array, so SciPy treats
        # each row as an observation vector and applies the euclidean metric
        # to those rows -- confirm this is the intended clustering input.
        linkage = hcluster.linkage( distances, method="single", metric="euclidean" )

        hcluster.dendrogram(linkage, labels=arena.ids)

        pylab.savefig( args.cluster_image, format=args.oformat )