diff sucos_cluster.py @ 6:b8725fec8c7b draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author bgruening
date Wed, 14 Apr 2021 09:30:48 +0000
parents f80cfac80c53
children
line wrap: on
line diff
--- a/sucos_cluster.py	Tue Jul 28 08:48:16 2020 -0400
+++ b/sucos_cluster.py	Wed Apr 14 09:30:48 2021 +0000
@@ -10,15 +10,16 @@
 GitHub: https://github.com/susanhleung/SuCOS
 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
 """
+import argparse
 
-import sucos, utils
-import argparse, gzip
-from rdkit import Chem
 import numpy as np
 import pandas as pd
-from scipy.cluster.hierarchy import linkage, fcluster
+import sucos
+import utils
+from rdkit import Chem
+from scipy.cluster.hierarchy import fcluster, linkage
 
-### start main execution #########################################
+# start main execution #########################################
 
 
 def calc_distance_matrix(mols):
@@ -44,13 +45,17 @@
             if tuple1[0] == tuple2[0]:
                 tmp.append(0.0)
             else:
-                #utils.log("Calculating SuCOS between", mol1, mol2)
-                sucos_score, fm_score, tani_score = sucos.get_SucosScore(tuple1[0], tuple2[0],
-                    tani=True, ref_features=tuple1[1], query_features=tuple2[1])
+                # utils.log("Calculating SuCOS between", mol1, mol2)
+                sucos_score, fm_score, tani_score = sucos.get_SucosScore(
+                    tuple1[0],
+                    tuple2[0],
+                    tani=True,
+                    ref_features=tuple1[1],
+                    query_features=tuple2[1],
+                )
                 tmp.append(1.0 - sucos_score)
         matrix.append(tmp)
 
-
     return matrix
 
 
@@ -64,24 +69,25 @@
 
     indexes = [x for x in range(0, len(matrix))]
     cols = [x for x in range(0, len(matrix[0]))]
-    #utils.log("indexes", indexes)
-    #utils.log("cols", cols)
+    # utils.log("indexes", indexes)
+    # utils.log("cols", cols)
     df = pd.DataFrame(matrix, columns=cols, index=indexes)
     utils.log("DataFrame:", df.shape)
-    #utils.log(df)
+    # utils.log(df)
     indices = np.triu_indices(df.shape[0], k=1)
-    #utils.log("Indices:", indices)
+    # utils.log("Indices:", indices)
     t = np.array(df)[indices]
-    Z = linkage(t, 'average')
+    Z = linkage(t, "average")
     lig_clusters = []
-    cluster_arr = fcluster(Z, t=threshold, criterion='distance')
+    cluster_arr = fcluster(Z, t=threshold, criterion="distance")
     for i in range(np.amax(cluster_arr)):
-        clus = df.columns[np.argwhere(cluster_arr==i+1)]
+        clus = df.columns[np.argwhere(cluster_arr == i + 1)]
         lig_clusters.append([x[0] for x in clus.tolist()])
 
     utils.log("Clusters", lig_clusters)
     return lig_clusters
 
+
 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False):
     """
     Write the molecules to SDF files, 1 file for each cluster.
@@ -99,7 +105,9 @@
         filename = basename + str(i) + ".sdf"
         if gzip:
             filename += ".gz"
-        utils.log("Writing ", len(cluster), "molecules in cluster", i, "to file", filename)
+        utils.log(
+            "Writing ", len(cluster), "molecules in cluster", i, "to file", filename
+        )
         output_file = utils.open_file_for_writing(filename)
         writer = Chem.SDWriter(output_file)
         for index in cluster:
@@ -110,14 +118,26 @@
         output_file.close()
 
 
-
 def main():
-    parser = argparse.ArgumentParser(description='Clustering with SuCOS and RDKit')
-    parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).')
-    parser.add_argument('-o', '--output', default="cluster", help="Base name for output files in SDF format. " +
-                                               "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created")
-    parser.add_argument('--gzip', action='store_true', help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz')
-    parser.add_argument('-t', '--threshold', type=float, default=0.8, help='Clustering threshold')
+    parser = argparse.ArgumentParser(description="Clustering with SuCOS and RDKit")
+    parser.add_argument(
+        "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)."
+    )
+    parser.add_argument(
+        "-o",
+        "--output",
+        default="cluster",
+        help="Base name for output files in SDF format. "
+        + "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created",
+    )
+    parser.add_argument(
+        "--gzip",
+        action="store_true",
+        help="Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz",
+    )
+    parser.add_argument(
+        "-t", "--threshold", type=float, default=0.8, help="Clustering threshold"
+    )
 
     args = parser.parse_args()
     utils.log("SuCOS Cluster Args: ", args)
@@ -131,4 +151,4 @@
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()