diff get_clusters.py @ 0:4f3222cb5cf6 draft

"planemo upload for repository https://github.com/galaxycomputationalchemistry/galaxy-tools-compchem/ commit 79589d149a8ff2791d4f71d28b155011672db827"
author chemteam
date Fri, 11 Sep 2020 21:54:45 +0000
parents
children afcb925def69
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_clusters.py	Fri Sep 11 21:54:45 2020 +0000
@@ -0,0 +1,39 @@
+import argparse
+import collections
+import json
+
+import numpy as np
+
+from scipy.cluster.hierarchy import fcluster
+
+
+def separate_clusters(Z_fpath, threshold, min_members, output):
+    Z = np.loadtxt(Z_fpath)
+    branch_assignments = fcluster(Z, threshold, criterion='distance')
+    cluster_dict = collections.defaultdict(list)
+    for n, val in enumerate(branch_assignments):
+        cluster_dict[branch_assignments[n]].append(n)
+    cluster_dict = {int(k): v for k, v in cluster_dict.items()
+                    if len(v) >= min_members}
+    with open(output, 'w') as f:
+        json.dump(cluster_dict, f, indent=4, sort_keys=True)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--Z', required=True,
+                        help='File for cluster linkage array.')
+    parser.add_argument('--threshold', type=int, required=True,
+                        help='Distance cutoff.')
+    parser.add_argument('--min-members', type=int, required=True,
+                        help='Minimum number of members of the cluster.')
+    parser.add_argument('--output', required=True,
+                        help='Output file.')
+    args = parser.parse_args()
+
+    separate_clusters(args.Z, args.threshold,
+                      args.min_members, args.output)
+
+
+if __name__ == "__main__":
+    main()