diff clustering_from_distmat.py @ 1:c0b01c55a0e0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit 65b5c6f177478883ce664aeb6f27d0bec7155fdc
author iuc
date Mon, 19 Aug 2024 15:33:16 +0000
parents 8192b416f945
children
line wrap: on
line diff
--- a/clustering_from_distmat.py	Thu Aug 08 19:34:36 2024 +0000
+++ b/clustering_from_distmat.py	Mon Aug 19 15:33:16 2024 +0000
@@ -1,5 +1,6 @@
 import argparse
 import sys
+from collections import Counter
 
 import scipy
 
@@ -45,6 +46,15 @@
         ],
         help="Clustering method to use"
     )
+    missing_names = parser.add_mutually_exclusive_group()
+    missing_names.add_argument(
+        "--nc", "--no-colnames", action="store_true",
+        help="Indicate that the distance matrix input does not feature column names"
+    )
+    missing_names.add_argument(
+        "--nr", "--no-rownames", action="store_true",
+        help="Indicate that the distance matrix input does not feature row names"
+    )
     cut_mode = parser.add_mutually_exclusive_group()
     cut_mode.add_argument(
         "-n", "--n-clusters", nargs="*", type=int
@@ -52,39 +62,67 @@
     cut_mode.add_argument(
         "--height", nargs="*", type=float
     )
+    parser.add_argument("-s", "--min-cluster-size", type=int, default=2)
     args = parser.parse_args()
 
-    # TO DO:
-    # - parse outputs to generate
-
     # read from input and check that
     # we have been passed a symmetric distance matrix
     with open(args.infile) as i:
-        col_names = next(i).rstrip("\n\r").split("\t")[1:]
-        col_count = len(col_names)
-        if not col_count:
-            sys.exit(
-                'No data columns found. '
-                'This tool expects tabular input with column names on the first line '
-                'and a row name in the first column of each row followed by data columns.'
-            )
+        col_count = None
         row_count = 0
         matrix = []
+        if args.nc:
+            col_names = col_count = None
+        else:
+            while True:
+                # skip leading empty lines
+                line = next(i).rstrip("\n\r")
+                if line:
+                    break
+            if args.nr:
+                col_names = line.split("\t")
+            else:
+                # first column is for row names, rest are column names
+                col_names = line.split("\t")[1:]
+            col_count = len(col_names)
+            if not col_count:
+                sys.exit(
+                    'No data columns found. '
+                    'By default, this tool expects tabular input with column names on the first line '
+                    'and a row name in the first column of each row followed by data columns. '
+                    'Use --no-colnames or --no-rownames to modify the expected format.'
+                )
         for line in i:
             if not line.strip():
                 # skip empty lines
                 continue
             row_count += 1
-            if row_count > col_count:
+            if col_count is not None and row_count > col_count:
                 sys.exit(
                     'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
                     'but got more rows than columns.'
                 )
-            row_name, *row_data = line.strip(" \n\r").split("\t")
+            if args.nr:
+                row_name = None
+                row_data = line.rstrip("\n\r").split("\t")
+            else:
+                row_name, *row_data = line.rstrip("\n\r").split("\t")
+            if col_count is None:
+                col_count = len(row_data)
+                col_names = [None] * col_count
             col_name = col_names[row_count - 1]
-            if not row_name:
+            if not row_name and col_name:
                 # tolerate omitted row names, use col name instead
                 row_name = col_name
+            elif row_name and not col_name:
+                # likewise for column names
+                # plus update list of col names with row name
+                col_name = col_names[row_count - 1] = row_name
+            elif not row_name and not col_name:
+                sys.exit(
+                    'Each sample in the distance matrix must have its name specified via a row name, a column name, or both, '
+                    f'but found no name for sample number {row_count}'
+                )
             if row_name != col_name:
                 sys.exit(
                     'This tool expects a symmetric distance matrix with identical names for rows and columns, '
@@ -98,7 +136,10 @@
             try:
                 matrix.append([float(x) for x in row_data])
             except ValueError as e:
-                sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
+                if args.nr:
+                    sys.exit(str(e) + f' on row {row_count}')
+                else:
+                    sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
     if row_count < col_count:
         sys.exit(
             'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
@@ -128,18 +169,47 @@
         header_cols = ["sample"] + [
             colname_template.format(x) for x in cut_values
         ]
+        cut_result = scipy.cluster.hierarchy.cut_tree(
+            linkage,
+            args.n_clusters,
+            args.height
+        )
+
+        # Go through the cut results once to determine cluster sizes
+
+        # In the final report, the ids of clusters with fewer members than
+        # args.min_cluster_size will be masked with "-".
+        # The remaining cluster ids will be renumbered to start fom 1.
+        # This has to be done for each clustering resulting from the
+        # user-specified cut_values.
+        cluster_member_counts = [Counter() for _ in cut_values]
+        effective_cluster_ids = [{} for _ in cut_values]
+        for cluster_ids in cut_result:
+            for cl_count, cl_id, eff_id in zip(cluster_member_counts, cluster_ids, effective_cluster_ids):
+                cl_count[cl_id] += 1
+        for counter, eff_ids in zip(cluster_member_counts, effective_cluster_ids):
+            eff_id = 1
+            for item, count in counter.items():
+                # Since Python 3.7, Counter objects (like dicts) preserve
+                # insertion order so we can be sure that in the mapping
+                # constructed below, clusters will get renumbered in
+                # the order they will be reported later.
+                if count >= args.min_cluster_size:
+                    eff_ids[item] = str(eff_id)
+                    eff_id += 1
+                else:
+                    eff_ids[item] = "-"
+
+        # build and write the cluster assignment report
+        # with remapped cluster ids
         cluster_assignments = []
-        for name, cluster_ids in zip(
-            col_names,
-            scipy.cluster.hierarchy.cut_tree(
-                linkage,
-                args.n_clusters,
-                args.height
-            )
-        ):
+        for name, cluster_ids in zip(col_names, cut_result):
             cluster_assignments.append(
                 [name]
-                + [str(c + 1) for c in cluster_ids]
+                + [
+                    eff_ids[c]
+                    for c, eff_ids in zip(cluster_ids, effective_cluster_ids)
+                ]
             )
         with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
             print("\t".join(header_cols), file=o)