Mercurial > repos > bgruening > chemfp

--- a/butina_clustering.py	Wed Jun 24 13:12:05 2020 -0400
+++ b/butina_clustering.py	Sat Sep 25 19:07:44 2021 +0000
@@ -5,22 +5,28 @@
     Thanks to Andrew Dalke of Andrew Dalke Scientific!
 """

-import chemfp
+import argparse
+import os
+import subprocess
 import sys
-import os
 import tempfile
-import argparse
-import subprocess
+
+import chemfp
 from chemfp import search

+
 def unix_sort(results):
     temp_unsorted = tempfile.NamedTemporaryFile(delete=False)
-    for (i,indices) in enumerate( results.iter_indices() ):
-        temp_unsorted.write('%s %s\n' % (len(indices), i))
+    for (i, indices) in enumerate(results.iter_indices()):
+        temp_unsorted.write("%s %s\n" % (len(indices), i))
     temp_unsorted.close()
     temp_sorted = tempfile.NamedTemporaryFile(delete=False)
     temp_sorted.close()
-    p = subprocess.Popen(['sort', '-n', '-r', '-k', '1,1'], stdin=open(temp_unsorted.name), stdout=open(temp_sorted.name, 'w+'))
+    p = subprocess.Popen(
+        ["sort", "-n", "-r", "-k", "1,1"],
+        stdin=open(temp_unsorted.name),
+        stdout=open(temp_sorted.name, "w+"),
+    )
     stdout, stderr = p.communicate()
     return_code = p.returncode

@@ -37,16 +43,19 @@

     os.remove(temp_sorted.name)

-def butina( args ):
+
+def butina(args):
     """
-        Taylor-Butina clustering from the chemfp help.
+    Taylor-Butina clustering from the chemfp help.
     """
     out = args.output_path
-    targets = chemfp.open( args.input_path, format='fps' )
-    arena = chemfp.load_fingerprints( targets )
+    targets = chemfp.open(args.input_path, format="fps")
+    arena = chemfp.load_fingerprints(targets)

-    chemfp.set_num_threads( args.processors )
-    results = search.threshold_tanimoto_search_symmetric(arena, threshold = args.tanimoto_threshold)
+    chemfp.set_num_threads(args.processors)
+    results = search.threshold_tanimoto_search_symmetric(
+        arena, threshold=args.tanimoto_threshold
+    )
     results.reorder_all("move-closest-first")

     sorted_ids = unix_sort(results)
@@ -57,10 +66,10 @@
     clusters = []

     seen = set()
-    #for (size, fp_idx, members) in results:
+    # for (size, fp_idx, members) in results:
     for (size, fp_idx) in sorted_ids:
         members = results[fp_idx].get_indices()
-        #print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
+        # print arena.ids[ fp_idx ], [arena.ids[ m ] for m in members]
         if fp_idx in seen:
             # Can't use a centroid which is already assigned
             continue
@@ -68,7 +77,7 @@

         if size == 0:
             # The only fingerprint in the exclusion sphere is itself
-            true_singletons.append( fp_idx )
+            true_singletons.append(fp_idx)
             continue

         # Figure out which ones haven't yet been assigned
@@ -79,16 +88,16 @@
             continue

         # this is a new cluster
-        clusters.append( (fp_idx, unassigned) )
+        clusters.append((fp_idx, unassigned))
         seen.update(unassigned)

     len_cluster = len(clusters)
-    #out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
-    #out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )
+    # out.write( "#%s true singletons: %s\n" % ( len(true_singletons), " ".join(sorted(arena.ids[idx] for idx in true_singletons)) ) )
+    # out.write( "#%s false singletons: %s\n" % ( len(false_singletons), " ".join(sorted(arena.ids[idx] for idx in false_singletons)) ) )

-    out.write( "#%s true singletons\n" % len(true_singletons) )
-    out.write( "#%s false singletons\n" % len(false_singletons) )
-    out.write( "#clusters: %s\n" % len_cluster )
+    out.write("#%s true singletons\n" % len(true_singletons))
+    out.write("#%s false singletons\n" % len(false_singletons))
+    out.write("#clusters: %s\n" % len_cluster)

     # Sort so the cluster with the most compounds comes first,
     # then by alphabetically smallest id
@@ -100,8 +109,11 @@

     for centroid_idx, members in clusters:
         centroid_name = arena.ids[centroid_idx]
-        out.write("%s\t%s\t%s\n" % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members)))
-        #ToDo: len(members) need to be some biggest top 90% or something ...
+        out.write(
+            "%s\t%s\t%s\n"
+            % (centroid_name, len(members), " ".join(arena.ids[idx] for idx in members))
+        )
+        # TODO: len(members) needs to be some biggest top 90% or something ...

     for idx in true_singletons:
         out.write("%s\t%s\n" % (arena.ids[idx], 0))
@@ -110,25 +122,41 @@


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="""Taylor-Butina clustering for fps files.
+    parser = argparse.ArgumentParser(
+        description="""Taylor-Butina clustering for fps files.
 For more details please see the original publication or the chemfp documentation:
 http://www.chemomine.co.uk/dbclus-paper.pdf
 https://chemfp.readthedocs.org
-""")
+"""
+    )

-    parser.add_argument("-i", "--input", dest="input_path",
-                    required=True,
-                    help="Path to the input file.")
+    parser.add_argument(
+        "-i",
+        "--input",
+        dest="input_path",
+        required=True,
+        help="Path to the input file.",
+    )

-    parser.add_argument("-o", "--output", dest="output_path", type=argparse.FileType('w'),
-                    default=sys.stdout,
-                    help="Path to the output file.")
+    parser.add_argument(
+        "-o",
+        "--output",
+        dest="output_path",
+        type=argparse.FileType("w"),
+        default=sys.stdout,
+        help="Path to the output file.",
+    )

-    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold", type=float,
-                    default=0.8,
-                    help="Tanimoto threshold [0.8]")
+    parser.add_argument(
+        "-t",
+        "--threshold",
+        dest="tanimoto_threshold",
+        type=float,
+        default=0.8,
+        help="Tanimoto threshold [0.8]",
+    )

-    parser.add_argument('-p', '--processors', type=int, default=4)
+    parser.add_argument("-p", "--processors", type=int, default=4)

     options = parser.parse_args()
-    butina( options )
+    butina(options)
--- a/butina_clustering.xml	Wed Jun 24 13:12:05 2020 -0400
+++ b/butina_clustering.xml	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,11 @@
-<tool id="ctb_chemfp_butina_clustering" name="Taylor-Butina clustering" version="1.6">
+<tool id="ctb_chemfp_butina_clustering" name="Taylor-Butina clustering" version="@TOOL_VERSION@+@VERSION_SUFFIX@">
     <description>of molecular fingerprints</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.6.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
     <requirements>
-        <requirement type="package" version="1.6">chemfp</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement>
     </requirements>
     <command detect_errors="exit_code">
 <![CDATA[
--- a/mol2fps.xml	Wed Jun 24 13:12:05 2020 -0400
+++ b/mol2fps.xml	Sat Sep 25 19:07:44 2021 +0000
@@ -1,9 +1,13 @@
-<tool id="ctb_chemfp_mol2fps" name="Molecule to fingerprint" version="1.6">
+<tool id="ctb_chemfp_mol2fps" name="Molecule to fingerprint" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
     <description>conversion to several different fingerprint formats</description>
     <!--parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10000" merge_outputs="outfile"></parallelism-->
     <requirements>
-        <requirement type="package" version="1.6">chemfp</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement>
     </requirements>
+    <macros>
+        <token name="@TOOL_VERSION@">1.6.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
     <command>
 <![CDATA[
--- a/nxn_clustering.py	Wed Jun 24 13:12:05 2020 -0400
+++ b/nxn_clustering.py	Sat Sep 25 19:07:44 2021 +0000
@@ -1,34 +1,38 @@
-#!/usr/bin/env python
+#!/usr/bin/env python
 """
     Modified version of code examples from the chemfp project.
     http://code.google.com/p/chem-fingerprints/
     Thanks to Andrew Dalke of Andrew Dalke Scientific!
 """
-import matplotlib
-matplotlib.use('Agg')
-from matplotlib import rcParams
-rcParams.update({'figure.autolayout': True})
+
 import argparse
-import os
+
 import chemfp
-import scipy.cluster.hierarchy as hcluster
-import pylab
-import numpy
+import matplotlib
+matplotlib.use("Agg")  # noqa
+from matplotlib import rcParams  # noqa
+rcParams.update({"figure.autolayout": True})  # noqa
+import numpy  # noqa
+import pylab  # noqa
+import scipy.cluster.hierarchy as hcluster  # noqa

-def distance_matrix(arena, tanimoto_threshold = 0.0):
+
+def distance_matrix(arena, tanimoto_threshold=0.0):
     n = len(arena)
     # Start off a similarity matrix with 1.0s along the diagonal
     try:
         similarities = numpy.identity(n, "d")
-    except:
-        raise Exception('Input dataset is to large!')
-    chemfp.set_num_threads( args.processors )
+    except Exception:
+        raise Exception("Input dataset is to large!")
+    chemfp.set_num_threads(args.processors)

-    ## Compute the full similarity matrix.
+    # Compute the full similarity matrix.
     # The implementation computes the upper-triangle then copies
     # the upper-triangle into lower-triangle. It does not include
     # terms for the diagonal.
-    results = chemfp.search.threshold_tanimoto_search_symmetric(arena, threshold=tanimoto_threshold)
+    results = chemfp.search.threshold_tanimoto_search_symmetric(
+        arena, threshold=tanimoto_threshold
+    )

     # Copy the results into the NumPy array.
     for row_index, row in enumerate(results.iter_indices_and_scores()):
@@ -40,41 +44,58 @@


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="""NxN clustering for fps files.
+    parser = argparse.ArgumentParser(
+        description="""NxN clustering for fps files.
 For more details please see the chemfp documentation:
 https://chemfp.readthedocs.org
-""")
+"""
+    )

-    parser.add_argument("-i", "--input", dest="input_path",
-                    required=True,
-                    help="Path to the input file.")
+    parser.add_argument(
+        "-i",
+        "--input",
+        dest="input_path",
+        required=True,
+        help="Path to the input file.",
+    )

-    parser.add_argument("-c", "--cluster", dest="cluster_image",
-                    help="Path to the output cluster image.")
+    parser.add_argument(
+        "-c",
+        "--cluster",
+        dest="cluster_image",
+        help="Path to the output cluster image.",
+    )

-    parser.add_argument("-s", "--smatrix", dest="similarity_matrix",
-                    help="Path to the similarity matrix output file.")
+    parser.add_argument(
+        "-s",
+        "--smatrix",
+        dest="similarity_matrix",
+        help="Path to the similarity matrix output file.",
+    )

-    parser.add_argument("-t", "--threshold", dest="tanimoto_threshold",
-                    type=float, default=0.0,
-                    help="Tanimoto threshold [0.0]")
+    parser.add_argument(
+        "-t",
+        "--threshold",
+        dest="tanimoto_threshold",
+        type=float,
+        default=0.0,
+        help="Tanimoto threshold [0.0]",
+    )

-    parser.add_argument("--oformat", default='png', help="Output format (png, svg)")
+    parser.add_argument("--oformat", default="png", help="Output format (png, svg)")

-    parser.add_argument('-p', '--processors', type=int,
-        default=4)
+    parser.add_argument("-p", "--processors", type=int, default=4)

     args = parser.parse_args()

-    targets = chemfp.open( args.input_path, format='fps' )
-    arena = chemfp.load_fingerprints( targets )
-    distances  = distance_matrix( arena, args.tanimoto_threshold )
+    targets = chemfp.open(args.input_path, format="fps")
+    arena = chemfp.load_fingerprints(targets)
+    distances = distance_matrix(arena, args.tanimoto_threshold)

     if args.similarity_matrix:
         numpy.savetxt(args.similarity_matrix, distances)

     if args.cluster_image:
         linkage = hcluster.linkage(distances, method="single", metric="euclidean")
-        hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.)
+        hcluster.dendrogram(linkage, labels=arena.ids, leaf_rotation=90.0)
         pylab.savefig(args.cluster_image, format=args.oformat)
-
--- a/nxn_clustering.xml	Wed Jun 24 13:12:05 2020 -0400
+++ b/nxn_clustering.xml	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,11 @@
-<tool id="ctb_chemfp_nxn_clustering" name="NxN clustering" version="1.6">
+<tool id="ctb_chemfp_nxn_clustering" name="NxN clustering" version="@TOOL_VERSION@+@VERSION_SUFFIX@">
     <description>of molecular fingerprints</description>
+    <macros>
+        <token name="@TOOL_VERSION@">1.6.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
     <requirements>
-        <requirement type="package" version="1.6">chemfp</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement>
         <requirement type="package" version="2.2.5">matplotlib</requirement>
         <requirement type="package" version="1.2.1">scipy</requirement>
     </requirements>
--- a/sdf2fps.xml	Wed Jun 24 13:12:05 2020 -0400
+++ b/sdf2fps.xml	Sat Sep 25 19:07:44 2021 +0000
@@ -1,8 +1,12 @@
-<tool id="ctb_sdf2fps" name="SDF to Fingerprint" version="1.6">
+<tool id="ctb_sdf2fps" name="SDF to Fingerprint" version="@TOOL_VERSION@+@VERSION_SUFFIX@">
     <description>- extract fingerprints from sdf file metadata</description>
     <!--parallelism method="multi" split_inputs="infile" split_mode="to_size" split_size="10000" shared_inputs="" merge_outputs="outfile"></parallelism-->
+    <macros>
+        <token name="@TOOL_VERSION@">1.6.1</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+    </macros>
     <requirements>
-        <requirement type="package" version="1.6">chemfp</requirement>
+        <requirement type="package" version="@TOOL_VERSION@">chemfp</requirement>
     </requirements>
     <command>
 <![CDATA[
--- a/test-data/CID_2244_FP2.fps	Wed Jun 24 13:12:05 2020 -0400
+++ b/test-data/CID_2244_FP2.fps	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,7 @@
 #FPS1
 #num_bits=1021
 #type=OpenBabel-FP2/1
-#software=OpenBabel/3.0.0 chemfp/1.6
-#source=CID_2244.sdf
-#date=2019-05-03T12:39:13
+#software=OpenBabel/3.0.0 chemfp/1.6.1
+#source=/tmp/tmp6hdbhy1a/files/a/f/f/dataset_affef7f7-3399-4725-a9fa-ea0be7eb33ee.dat
+#date=2021-09-22T11:52:48
 00000010004000c00000020000030000010000000008000000000080000000000400400000000010200a020800000000000042000000000000800002000002000c200800010001010000000002808002208000400000000040080000000100000008000000000002004002000010000000020100080100200808000000000004	2244
--- a/test-data/CID_2244_FP3.fps	Wed Jun 24 13:12:05 2020 -0400
+++ b/test-data/CID_2244_FP3.fps	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,7 @@
 #FPS1
 #num_bits=55
 #type=OpenBabel-FP3/1
-#software=OpenBabel/3.0.0 chemfp/1.6
-#source=CID_2244.sdf
-#date=2019-05-03T12:39:21
+#software=OpenBabel/3.0.0 chemfp/1.6.1
+#source=/tmp/tmp6hdbhy1a/files/7/6/2/dataset_762912b3-00ea-43f4-929b-ec5575808d8d.dat
+#date=2021-09-22T11:53:12
 0400000c50b007	2244
--- a/test-data/CID_2244_FP4.fps	Wed Jun 24 13:12:05 2020 -0400
+++ b/test-data/CID_2244_FP4.fps	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,7 @@
 #FPS1
 #num_bits=307
 #type=OpenBabel-FP4/1
-#software=OpenBabel/3.0.0 chemfp/1.6
-#source=CID_2244.sdf
-#date=2019-05-03T12:39:27
+#software=OpenBabel/3.0.0 chemfp/1.6.1
+#source=/tmp/tmp6hdbhy1a/files/3/f/e/dataset_3fe99fed-d2c3-4f2e-a45b-f3f8ae8b83ad.dat
+#date=2021-09-22T11:53:34
 010000000000000000009800000000004001000000000000000000000000000000000240402801	2244
--- a/test-data/CID_2244_MACCS.fps	Wed Jun 24 13:12:05 2020 -0400
+++ b/test-data/CID_2244_MACCS.fps	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,7 @@
 #FPS1
 #num_bits=166
 #type=OpenBabel-MACCS/2
-#software=OpenBabel/3.0.0 chemfp/1.6
-#source=CID_2244.sdf
-#date=2019-05-03T12:39:49
+#software=OpenBabel/3.0.0 chemfp/1.6.1
+#source=/tmp/tmp6hdbhy1a/files/6/6/e/dataset_66e4701a-88cc-4176-aae2-d18d73319e2a.dat
+#date=2021-09-22T11:54:01
 0000000000000000000000010000016480cca2d21e	2244
--- a/test-data/CID_2244_torsions.fps	Wed Jun 24 13:12:05 2020 -0400
+++ b/test-data/CID_2244_torsions.fps	Sat Sep 25 19:07:44 2021 +0000
@@ -1,7 +1,7 @@
 #FPS1
 #num_bits=2048
 #type=RDKit-Torsion/2 fpSize=2048 targetSize=4
-#software=RDKit/2018.09.3 chemfp/1.6
-#source=test-data/CID_2244.smi
-#date=2019-04-25T15:11:02
+#software=RDKit/2018.09.3 chemfp/1.6.1
+#source=/tmp/tmp6hdbhy1a/files/2/b/a/dataset_2baed193-8060-4271-9a6f-4f865eea0daa.dat
+#date=2021-09-22T11:54:15
 00100010000003000000000000000000000000000000000000000000000000000000000000000000000000000000000300000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000000000000000000000000000000000000000000000100000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000010000001000000000000110000000000000000000000000000000000001001000000000000000000001000000000000000000000	2244