Mercurial > repos > bgruening > chemfp
changeset 12:3b14765c22ee draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/chemfp commit 7fb96a3844b4771084f18de2346ed6d5e241d839"
author | bgruening |
---|---|
date | Sat, 25 Sep 2021 19:07:44 +0000 |
parents | 92c7cdc243e8 |
children | |
files | butina_clustering.py butina_clustering.xml mol2fps.xml nxn_clustering.py nxn_clustering.xml sdf2fps.xml test-data/CID_2244_FP2.fps test-data/CID_2244_FP3.fps test-data/CID_2244_FP4.fps test-data/CID_2244_MACCS.fps test-data/CID_2244_torsions.fps |
diffstat | 11 files changed, 160 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/butina_clustering.py Wed Jun 24 13:12:05 2020 -0400 +++ b/butina_clustering.py Sat Sep 25 19:07:44 2021 +0000 @@ -5,22 +5,28 @@ Thanks to Andrew Dalke of Andrew Dalke Scientific! """ -import chemfp +import argparse +import os +import subprocess import sys -import os import tempfile -import argparse -import subprocess + +import chemfp from chemfp import search + def unix_sort(results): temp_unsorted = tempfile.NamedTemporaryFile(delete=False) - for (i,indices) in enumerate( results.iter_indices() ): - temp_unsorted.write('%s %s\n' % (len(indices), i)) + for (i, indices) in enumerate(results.iter_indices()): + temp_unsorted.write("%s %s\n" % (len(indices), i)) temp_unsorted.close() temp_sorted = tempfile.NamedTemporaryFile(delete=False) temp_sorted.close() - p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+')) + p = subprocess.Popen( + ["sort", "-n", "-r", "-k", "1,1"], + stdin=open(temp_unsorted.name), + stdout=open(temp_sorted.name, "w+"), + ) stdout, stderr = p.communicate() return_code = p.returncode @@ -37,16 +43,19 @@ os.remove(temp_sorted.name) -def butina( args ): + +def butina(args): """ - Taylor-Butina clustering from the chemfp help. + Taylor-Butina clustering from the chemfp help. """ out = args.output_path - targets = chemfp.open( args.input_path, format='fps' ) - arena = chemfp.load_fingerprints( targets ) + targets = chemfp.open(args.input_path, format="fps") + arena = chemfp.load_fingerprints(targets) - chemfp.set_num_threads( args.processors ) - results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold) + chemfp.set_num_threads(args.processors) + results = search.threshold_tanimoto_search_symmetric( + arena, threshold=args.tanimoto_threshold + ) results.reorder_all("move-closest-first") sorted_ids = unix_sort(results) @@ -57,10 +66,10 @@ clusters = [] seen = set() - #for (size, fp_idx, members) in results: + # for (size, fp_idx, members) in results: for (size, fp_idx) in sorted_ids: members = results[fp_idx].get_indices() - #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] + # print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members] if fp_idx in seen: # Can't use a centroid which is already assigned continue @@ -68,7 +77,7 @@ if size == 0: # The only fingerprint in the exclusion sphere is itself - true_singletons.append( fp_idx ) + true_singletons.append(fp_idx) continue # Figure out which ones haven't yet been assigned @@ -79,16 +88,16 @@ continue # this is a new cluster - clusters.append( (fp_idx, unassigned) ) + clusters.append((fp_idx, unassigned)) seen.update(unassigned) len_cluster = len(clusters) - #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) - #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) + # out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) ) + # out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) ) - out.write( "#%s true singletons\n" % len(true_singletons) ) - out.write( "#%s false singletons\n" % len(false_singletons) ) - out.write( "#clusters: %s\n" % len_cluster ) + out.write("#%s true singletons\n" % len(true_singletons)) + out.write("#%s false singletons\n" % len(false_singletons)) + out.write("#clusters: %s\n" % len_cluster) # Sort so the cluster with the most compounds comes first, # then by alphabetically smallest id @@ -100,8 +109,11 @@ for centroid_idx, members in clusters: centroid_name = arena.ids[centroid_idx] - out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))) - #ToDo: len(members) need to be some biggest top 90% or something ... + out.write( + "%s\t%s\t%s\n" + % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)) + ) + # ToDo: len(members) need to be some biggest top 90% or something ... for idx in true_singletons: out.write("%s\t%s\n" % (arena.ids[idx], 0)) @@ -110,25 +122,41 @@ if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files. + parser = argparse.ArgumentParser( + description="""Taylor-Butina clustering for fps files. For more details please see the original publication or the chemfp documentation: http://www.chemomine.co.uk/dbclus-paper.pdf https://chemfp.readthedocs.org -""") +""" + ) - parser.add_argument("-i", "--input", dest="input_path", - required=True, - help="Path to the input file.") + parser.add_argument( + "-i", + "--input", + dest="input_path", + required=True, + help="Path to the input file.", + ) - parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'), - default=sys.stdout, - help="Path to the output file.") + parser.add_argument( + "-o", + "--output", + dest="output_path", + type=argparse.FileType("w"), + default=sys.stdout, + help="Path to the output file.", + ) - parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float, - default=0.8, - help="Tanimoto threshold [0.8]") + parser.add_argument( + "-t", + "--threshold", + dest="tanimoto_threshold", + type=float, + default=0.8, + help="Tanimoto threshold [0.8]", + ) - parser.add_argument('-p', '--processors', type=int, default=4) + parser.add_argument("-p", "--processors", type=int, default=4) options = parser.parse_args() - butina( options ) + butina(options)
--- a/butina_clustering.xml Wed Jun 24 13:12:05 2020 -0400 +++ b/butina_clustering.xml Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,11 @@ -<tool id="ctb_chemfp_butina_clustering" name="Taylor-Butina clustering" version="1.6"> +<tool id="ctb_chemfp_butina_clustering" name="Taylor-Butina clustering" version="@TOOL_VERSION@+@VERSION_SUFFIX@"> <description>of molecular fingerprints</description> + <macros> + <token name="@TOOL_VERSION@">1.6.1</token> + <token name="@VERSION_SUFFIX@">0</token> + </macros> <requirements> - <requirement type="package" version="1.6">chemfp</requirement> + <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement> </requirements> <command detect_errors="exit_code"> <![CDATA[
--- a/mol2fps.xml Wed Jun 24 13:12:05 2020 -0400 +++ b/mol2fps.xml Sat Sep 25 19:07:44 2021 +0000 @@ -1,9 +1,13 @@ -<tool id="ctb_chemfp_mol2fps" name="Molecule to fingerprint" version="1.6"> +<tool id="ctb_chemfp_mol2fps" name="Molecule to fingerprint" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"> <description>conversion to several different fingerprint formats</description> <!--parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10000" merge_outputs="outfile"></parallelism--> <requirements> - <requirement type="package" version="1.6">chemfp</requirement> + <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement> </requirements> + <macros> + <token name="@TOOL_VERSION@">1.6.1</token> + <token name="@VERSION_SUFFIX@">0</token> + </macros> <command> <![CDATA[
--- a/nxn_clustering.py Wed Jun 24 13:12:05 2020 -0400 +++ b/nxn_clustering.py Sat Sep 25 19:07:44 2021 +0000 @@ -1,34 +1,38 @@ -#!/usr/bin/env python +# !/usr/bin/env python """ Modified version of code examples from the chemfp project. http://code.google.com/p/chem-fingerprints/ Thanks to Andrew Dalke of Andrew Dalke Scientific! """ -import matplotlib -matplotlib.use('Agg') -from matplotlib import rcParams -rcParams.update({'figure.autolayout': True}) + import argparse -import os + import chemfp -import scipy.cluster.hierarchy as hcluster -import pylab -import numpy +import matplotlib +matplotlib.use("Agg") # noqa +from matplotlib import rcParams # noqa +rcParams.update({"figure.autolayout": True}) # noqa +import numpy # noqa +import pylab # noqa +import scipy.cluster.hierarchy as hcluster # noqa -def distance_matrix(arena, tanimoto_threshold = 0.0): + +def distance_matrix(arena, tanimoto_threshold=0.0): n = len(arena) # Start off a similarity matrix with 1.0s along the diagonal try: similarities = numpy.identity(n, "d") - except: - raise Exception('Input dataset is to large!') - chemfp.set_num_threads( args.processors ) + except Exception: + raise Exception("Input dataset is to large!") + chemfp.set_num_threads(args.processors) - ## Compute the full similarity matrix. + # Compute the full similarity matrix. # The implementation computes the upper-triangle then copies # the upper-triangle into lower-triangle. It does not include # terms for the diagonal. - results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold) + results = chemfp.search.threshold_tanimoto_search_symmetric( + arena, threshold=tanimoto_threshold + ) # Copy the results into the NumPy array. for row_index, row in enumerate(results.iter_indices_and_scores()): @@ -40,41 +44,58 @@ if __name__ == "__main__": - parser = argparse.ArgumentParser(description="""NxN clustering for fps files. + parser = argparse.ArgumentParser( + description="""NxN clustering for fps files. For more details please see the chemfp documentation: https://chemfp.readthedocs.org -""") +""" + ) - parser.add_argument("-i", "--input", dest="input_path", - required=True, - help="Path to the input file.") + parser.add_argument( + "-i", + "--input", + dest="input_path", + required=True, + help="Path to the input file.", + ) - parser.add_argument("-c", "--cluster", dest="cluster_image", - help="Path to the output cluster image.") + parser.add_argument( + "-c", + "--cluster", + dest="cluster_image", + help="Path to the output cluster image.", + ) - parser.add_argument("-s", "--smatrix", dest="similarity_matrix", - help="Path to the similarity matrix output file.") + parser.add_argument( + "-s", + "--smatrix", + dest="similarity_matrix", + help="Path to the similarity matrix output file.", + ) - parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", - type=float, default=0.0, - help="Tanimoto threshold [0.0]") + parser.add_argument( + "-t", + "--threshold", + dest="tanimoto_threshold", + type=float, + default=0.0, + help="Tanimoto threshold [0.0]", + ) - parser.add_argument("--oformat", default='png', help="Output format (png, svg)") + parser.add_argument("--oformat", default="png", help="Output format (png, svg)") - parser.add_argument('-p', '--processors', type=int, - default=4) + parser.add_argument("-p", "--processors", type=int, default=4) args = parser.parse_args() - targets = chemfp.open( args.input_path, format='fps' ) - arena = chemfp.load_fingerprints( targets ) - distances = distance_matrix( arena, args.tanimoto_threshold ) + targets = chemfp.open(args.input_path, format="fps") + arena = chemfp.load_fingerprints(targets) + distances = distance_matrix(arena, args.tanimoto_threshold) if args.similarity_matrix: numpy.savetxt(args.similarity_matrix, distances) if args.cluster_image: linkage = hcluster.linkage(distances, method="single", metric="euclidean") - hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.) + hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0) pylab.savefig(args.cluster_image, format=args.oformat) -
--- a/nxn_clustering.xml Wed Jun 24 13:12:05 2020 -0400 +++ b/nxn_clustering.xml Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,11 @@ -<tool id="ctb_chemfp_nxn_clustering" name="NxN clustering" version="1.6"> +<tool id="ctb_chemfp_nxn_clustering" name="NxN clustering" version="@TOOL_VERSION@+@VERSION_SUFFIX@"> <description>of molecular fingerprints</description> + <macros> + <token name="@TOOL_VERSION@">1.6.1</token> + <token name="@VERSION_SUFFIX@">0</token> + </macros> <requirements> - <requirement type="package" version="1.6">chemfp</requirement> + <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement> <requirement type="package" version="2.2.5">matplotlib</requirement> <requirement type="package" version="1.2.1">scipy</requirement> </requirements>
--- a/sdf2fps.xml Wed Jun 24 13:12:05 2020 -0400 +++ b/sdf2fps.xml Sat Sep 25 19:07:44 2021 +0000 @@ -1,8 +1,12 @@ -<tool id="ctb_sdf2fps" name="SDF to Fingerprint" version="1.6"> +<tool id="ctb_sdf2fps" name="SDF to Fingerprint" version="@TOOL_VERSION@+@VERSION_SUFFIX@"> <description>- extract fingerprints from sdf file metadata</description> <!--parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10000" shared_inputs="" merge_outputs="outfile"></parallelism--> + <macros> + <token name="@TOOL_VERSION@">1.6.1</token> + <token name="@VERSION_SUFFIX@">0</token> + </macros> <requirements> - <requirement type="package" version="1.6">chemfp</requirement> + <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement> </requirements> <command> <![CDATA[
--- a/test-data/CID_2244_FP2.fps Wed Jun 24 13:12:05 2020 -0400 +++ b/test-data/CID_2244_FP2.fps Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,7 @@ #FPS1 #num_bits=1021 #type=OpenBabel-FP2/1 -#software=OpenBabel/3.0.0 chemfp/1.6 -#source=CID_2244.sdf -#date=2019-05-03T12:39:13 +#software=OpenBabel/3.0.0 chemfp/1.6.1 +#source=/tmp/tmp6hdbhy1a/files/a/f/f/dataset_affef7f7-3399-4725-a9fa-ea0be7eb33ee.dat +#date=2021-09-22T11:52:48 00000010004000c00000020000030000010000000008000000000080000000000400400000000010200a020800000000000042000000000000800002000002000c200800010001010000000002808002208000400000000040080000000100000008000000000002004002000010000000020100080100200808000000000004 2244
--- a/test-data/CID_2244_FP3.fps Wed Jun 24 13:12:05 2020 -0400 +++ b/test-data/CID_2244_FP3.fps Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,7 @@ #FPS1 #num_bits=55 #type=OpenBabel-FP3/1 -#software=OpenBabel/3.0.0 chemfp/1.6 -#source=CID_2244.sdf -#date=2019-05-03T12:39:21 +#software=OpenBabel/3.0.0 chemfp/1.6.1 +#source=/tmp/tmp6hdbhy1a/files/7/6/2/dataset_762912b3-00ea-43f4-929b-ec5575808d8d.dat +#date=2021-09-22T11:53:12 0400000c50b007 2244
--- a/test-data/CID_2244_FP4.fps Wed Jun 24 13:12:05 2020 -0400 +++ b/test-data/CID_2244_FP4.fps Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,7 @@ #FPS1 #num_bits=307 #type=OpenBabel-FP4/1 -#software=OpenBabel/3.0.0 chemfp/1.6 -#source=CID_2244.sdf -#date=2019-05-03T12:39:27 +#software=OpenBabel/3.0.0 chemfp/1.6.1 +#source=/tmp/tmp6hdbhy1a/files/3/f/e/dataset_3fe99fed-d2c3-4f2e-a45b-f3f8ae8b83ad.dat +#date=2021-09-22T11:53:34 010000000000000000009800000000004001000000000000000000000000000000000240402801 2244
--- a/test-data/CID_2244_MACCS.fps Wed Jun 24 13:12:05 2020 -0400 +++ b/test-data/CID_2244_MACCS.fps Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,7 @@ #FPS1 #num_bits=166 #type=OpenBabel-MACCS/2 -#software=OpenBabel/3.0.0 chemfp/1.6 -#source=CID_2244.sdf -#date=2019-05-03T12:39:49 +#software=OpenBabel/3.0.0 chemfp/1.6.1 +#source=/tmp/tmp6hdbhy1a/files/6/6/e/dataset_66e4701a-88cc-4176-aae2-d18d73319e2a.dat +#date=2021-09-22T11:54:01 0000000000000000000000010000016480cca2d21e 2244
--- a/test-data/CID_2244_torsions.fps Wed Jun 24 13:12:05 2020 -0400 +++ b/test-data/CID_2244_torsions.fps Sat Sep 25 19:07:44 2021 +0000 @@ -1,7 +1,7 @@ #FPS1 #num_bits=2048 #type=RDKit-Torsion/2 fpSize=2048 targetSize=4 -#software=RDKit/2018.09.3 chemfp/1.6 -#source=test-data/CID_2244.smi -#date=2019-04-25T15:11:02 +#software=RDKit/2018.09.3 chemfp/1.6.1 +#source=/tmp/tmp6hdbhy1a/files/2/b/a/dataset_2baed193-8060-4271-9a6f-4f865eea0daa.dat +#date=2021-09-22T11:54:15 00100010000003000000000000000000000000000000000000000000000000000000000000000000000000000000000300000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000100000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000001000000000000110000000000000000000000000000000000001001000000000000000000001000000000000000000000 2244