annotate sucos_cluster.py @ 7:9b48456a96fe draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author bgruening
date Wed, 14 Apr 2021 09:30:28 +0000
parents bb5365381c8f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
2 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
3 Cluster a set of molecules based on their 3D overlays as determined by the SuCOS score.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
4
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
5 This will generate a set of SD files, one for each cluster of molecules (presumably corresponding to a
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
6 binding pocket in the protein target).
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
7
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
8
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
9 SuCOS is the work of Susan Leung.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
10 GitHub: https://github.com/susanhleung/SuCOS
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
12 """
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
13 import argparse
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
14
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
15 import numpy as np
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
16 import pandas as pd
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
17 import sucos
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
18 import utils
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
19 from rdkit import Chem
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
20 from scipy.cluster.hierarchy import fcluster, linkage
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
21
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
22 # start main execution #########################################
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
23
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
24
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
25 def calc_distance_matrix(mols):
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
26 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
27 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
28 distance possible being 1.0.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
29 :param mols: A list of molecules. It must be possible to iterate through this list multiple times
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
30 :return: A NxN 2D array of distance scores, with N being the number of molecules in the input
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
31 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
32
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
33 # TODO - do we need to calculate both sides of the matrix? Tanimoto is supposed to be a symmetric distance measure,
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
34 # but the matrix that is generated does not seem to be symmetric.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
35
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
36 mol_fm_tuples = []
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
37 for mol in mols:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
38 features = sucos.getRawFeatures(mol)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
39 mol_fm_tuples.append((mol, features))
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
40
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
41 matrix = []
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
42 for tuple1 in mol_fm_tuples:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
43 tmp = []
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
44 for tuple2 in mol_fm_tuples:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
45 if tuple1[0] == tuple2[0]:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
46 tmp.append(0.0)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
47 else:
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
48 # utils.log("Calculating SuCOS between", mol1, mol2)
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
49 sucos_score, fm_score, tani_score = sucos.get_SucosScore(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
50 tuple1[0],
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
51 tuple2[0],
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
52 tani=True,
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
53 ref_features=tuple1[1],
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
54 query_features=tuple2[1],
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
55 )
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
56 tmp.append(1.0 - sucos_score)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
57 matrix.append(tmp)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
58
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
59 return matrix
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
60
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
61
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
62 def cluster(matrix, threshold=0.8):
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
63 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
64 Cluster the supplied distance matrix returning an array of clusters.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
65 :param matrix: the distance matrix, as calculated with the calc_distance_matrix function.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
66 :param threshold: The clustering cuttoff. The default of 0.8 is a reasonable value to use.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
67 :return: An array of clusters, each cluster being an array of the indices from the matrix.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
68 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
69
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
70 indexes = [x for x in range(0, len(matrix))]
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
71 cols = [x for x in range(0, len(matrix[0]))]
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
72 # utils.log("indexes", indexes)
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
73 # utils.log("cols", cols)
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
74 df = pd.DataFrame(matrix, columns=cols, index=indexes)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
75 utils.log("DataFrame:", df.shape)
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
76 # utils.log(df)
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
77 indices = np.triu_indices(df.shape[0], k=1)
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
78 # utils.log("Indices:", indices)
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
79 t = np.array(df)[indices]
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
80 Z = linkage(t, "average")
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
81 lig_clusters = []
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
82 cluster_arr = fcluster(Z, t=threshold, criterion="distance")
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
83 for i in range(np.amax(cluster_arr)):
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
84 clus = df.columns[np.argwhere(cluster_arr == i + 1)]
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
85 lig_clusters.append([x[0] for x in clus.tolist()])
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
86
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
87 utils.log("Clusters", lig_clusters)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
88 return lig_clusters
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
89
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
90
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
91 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False):
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
92 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
93 Write the molecules to SDF files, 1 file for each cluster.
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
94 :param mols The molecules to write:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
95 :param clusters The clusters, as returned by the cluster function:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
96 :param basename The basename for the file name. e.g. if basename is 'output' then files like
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
97 output1.sdf, output2.sdf will be written:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
98 :param gzip Whether to gzip the output
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
99 :return:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
100 """
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
101
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
102 i = 0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
103 for cluster in clusters:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
104 i += 1
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
105 filename = basename + str(i) + ".sdf"
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
106 if gzip:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
107 filename += ".gz"
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
108 utils.log(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
109 "Writing ", len(cluster), "molecules in cluster", i, "to file", filename
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
110 )
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
111 output_file = utils.open_file_for_writing(filename)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
112 writer = Chem.SDWriter(output_file)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
113 for index in cluster:
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
114 mol = mols[index]
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
115 writer.write(mol)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
116 writer.flush()
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
117 writer.close()
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
118 output_file.close()
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
119
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
120
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
121 def main():
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
122 parser = argparse.ArgumentParser(description="Clustering with SuCOS and RDKit")
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
123 parser.add_argument(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
124 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)."
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
125 )
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
126 parser.add_argument(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
127 "-o",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
128 "--output",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
129 default="cluster",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
130 help="Base name for output files in SDF format. "
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
131 + "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
132 )
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
133 parser.add_argument(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
134 "--gzip",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
135 action="store_true",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
136 help="Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz",
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
137 )
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
138 parser.add_argument(
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
139 "-t", "--threshold", type=float, default=0.8, help="Clustering threshold"
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
140 )
0
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
141
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
142 args = parser.parse_args()
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
143 utils.log("SuCOS Cluster Args: ", args)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
144
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
145 input_file = utils.open_file_for_reading(args.input)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
146 suppl = Chem.ForwardSDMolSupplier(input_file)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
147 mols = list(suppl)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
148 matrix = calc_distance_matrix(mols)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
149 clusters = cluster(matrix, threshold=args.threshold)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
150 write_clusters_to_sdfs(mols, clusters, args.output, gzip=args.gzip)
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
151
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
152
bb5365381c8f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
bgruening
parents:
diff changeset
153 if __name__ == "__main__":
7
9b48456a96fe "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
bgruening
parents: 0
diff changeset
154 main()