Mercurial > repos > iuc > clustering_from_distmat

--- a/clustering_from_distmat.py	Thu Aug 08 19:34:36 2024 +0000
+++ b/clustering_from_distmat.py	Mon Aug 19 15:33:16 2024 +0000
@@ -1,5 +1,6 @@
 import argparse
 import sys
+from collections import Counter

 import scipy

@@ -45,6 +46,15 @@
         ],
         help="Clustering method to use"
     )
+    missing_names = parser.add_mutually_exclusive_group()
+    missing_names.add_argument(
+        "--nc", "--no-colnames", action="store_true",
+        help="Indicate that the distance matrix input does not feature column names"
+    )
+    missing_names.add_argument(
+        "--nr", "--no-rownames", action="store_true",
+        help="Indicate that the distance matrix input does not feature row names"
+    )
     cut_mode = parser.add_mutually_exclusive_group()
     cut_mode.add_argument(
         "-n", "--n-clusters", nargs="*", type=int
@@ -52,39 +62,67 @@
     cut_mode.add_argument(
         "--height", nargs="*", type=float
     )
+    parser.add_argument("-s", "--min-cluster-size", type=int, default=2)
     args = parser.parse_args()

-    # TO DO:
-    # - parse outputs to generate
-
     # read from input and check that
     # we have been passed a symmetric distance matrix
     with open(args.infile) as i:
-        col_names = next(i).rstrip("\n\r").split("\t")[1:]
-        col_count = len(col_names)
-        if not col_count:
-            sys.exit(
-                'No data columns found. '
-                'This tool expects tabular input with column names on the first line '
-                'and a row name in the first column of each row followed by data columns.'
-            )
+        col_count = None
         row_count = 0
         matrix = []
+        if args.nc:
+            col_names = col_count = None
+        else:
+            while True:
+                # skip leading empty lines
+                line = next(i).rstrip("\n\r")
+                if line:
+                    break
+            if args.nr:
+                col_names = line.split("\t")
+            else:
+                # first column is for row names, rest are column names
+                col_names = line.split("\t")[1:]
+            col_count = len(col_names)
+            if not col_count:
+                sys.exit(
+                    'No data columns found. '
+                    'By default, this tool expects tabular input with column names on the first line '
+                    'and a row name in the first column of each row followed by data columns. '
+                    'Use --no-colnames or --no-rownames to modify the expected format.'
+                )
         for line in i:
             if not line.strip():
                 # skip empty lines
                 continue
             row_count += 1
-            if row_count > col_count:
+            if col_count is not None and row_count > col_count:
                 sys.exit(
                     'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
                     'but got more rows than columns.'
                 )
-            row_name, *row_data = line.strip(" \n\r").split("\t")
+            if args.nr:
+                row_name = None
+                row_data = line.rstrip("\n\r").split("\t")
+            else:
+                row_name, *row_data = line.rstrip("\n\r").split("\t")
+            if col_count is None:
+                col_count = len(row_data)
+                col_names = [None] * col_count
             col_name = col_names[row_count - 1]
-            if not row_name:
+            if not row_name and col_name:
                 # tolerate omitted row names, use col name instead
                 row_name = col_name
+            elif row_name and not col_name:
+                # likewise for column names
+                # plus update list of col names with row name
+                col_name = col_names[row_count - 1] = row_name
+            elif not row_name and not col_name:
+                sys.exit(
+                    'Each sample in the distance matrix must have its name specified via a row name, a column name, or both, '
+                    f'but found no name for sample number {row_count}'
+                )
             if row_name != col_name:
                 sys.exit(
                     'This tool expects a symmetric distance matrix with identical names for rows and columns, '
@@ -98,7 +136,10 @@
             try:
                 matrix.append([float(x) for x in row_data])
             except ValueError as e:
-                sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
+                if args.nr:
+                    sys.exit(str(e) + f' on row {row_count}')
+                else:
+                    sys.exit(str(e) + f' on row {row_count} ("{row_name}")')
     if row_count < col_count:
         sys.exit(
             'This tool expects a symmetric distance matrix with an equal number of rows and columns, '
@@ -128,18 +169,47 @@
         header_cols = ["sample"] + [
             colname_template.format(x) for x in cut_values
         ]
+        cut_result = scipy.cluster.hierarchy.cut_tree(
+            linkage,
+            args.n_clusters,
+            args.height
+        )
+
+        # Go through the cut results once to determine cluster sizes
+
+        # In the final report, the ids of clusters with fewer members than
+        # args.min_cluster_size will be masked with "-".
+        # The remaining cluster ids will be renumbered to start from 1.
+        # This has to be done for each clustering resulting from the
+        # user-specified cut_values.
+        cluster_member_counts = [Counter() for _ in cut_values]
+        effective_cluster_ids = [{} for _ in cut_values]
+        for cluster_ids in cut_result:
+            for cl_count, cl_id, eff_id in zip(cluster_member_counts, cluster_ids, effective_cluster_ids):
+                cl_count[cl_id] += 1
+        for counter, eff_ids in zip(cluster_member_counts, effective_cluster_ids):
+            eff_id = 1
+            for item, count in counter.items():
+                # Since Python 3.7, Counter objects (like dicts) preserve
+                # insertion order so we can be sure that in the mapping
+                # constructed below, clusters will get renumbered in
+                # the order they will be reported later.
+                if count >= args.min_cluster_size:
+                    eff_ids[item] = str(eff_id)
+                    eff_id += 1
+                else:
+                    eff_ids[item] = "-"
+
+        # build and write the cluster assignment report
+        # with remapped cluster ids
         cluster_assignments = []
-        for name, cluster_ids in zip(
-            col_names,
-            scipy.cluster.hierarchy.cut_tree(
-                linkage,
-                args.n_clusters,
-                args.height
-            )
-        ):
+        for name, cluster_ids in zip(col_names, cut_result):
             cluster_assignments.append(
                 [name]
-                + [str(c + 1) for c in cluster_ids]
+                + [
+                    eff_ids[c]
+                    for c, eff_ids in zip(cluster_ids, effective_cluster_ids)
+                ]
             )
         with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o:
             print("\t".join(header_cols), file=o)
--- a/clustering_from_distmat.xml	Thu Aug 08 19:34:36 2024 +0000
+++ b/clustering_from_distmat.xml	Mon Aug 19 15:33:16 2024 +0000
@@ -1,5 +1,11 @@
-<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0">
+<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.1" profile="23.0">
     <description>using Scipy</description>
+    <macros>
+        <xml name="cluster_assignment_options">
+            <param name="min_cluster_size" type="integer" value="2" min="1" label="Mask clusters with less than this number of samples" help="Samples assigned to clusters smaller than this threshold will have '-' in the corresponding cluster ID column" />
+            <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
+        </xml>
+    </macros>
     <edam_topics>
         <edam_topic>topic_2269</edam_topic>
         <edam_topic>topic_0084</edam_topic>
@@ -16,11 +22,15 @@
   '$distmat'
   result
   --method $method
+  $missing_names
   #if str($cluster_assignment.select) == 'n-cluster':
     --n-clusters $cluster_assignment.n_cluster
   #elif str($cluster_assignment.select) == 'height':
     --height $cluster_assignment.height
   #end if
+  #if str($cluster_assignment.select) != 'dendrogram-only' and $cluster_assignment.min_cluster_size != 2:
+    --min-cluster-size $cluster_assignment.min_cluster_size
+  #end if
     ]]></command>
     <inputs>
         <param name="distmat" type="data" format="tabular" label="Distance matrix" />
@@ -33,6 +43,11 @@
             <option value="median">WPGMC (scipy 'median' method)</option>
             <option value="ward">Ward/Incremental (scipy 'ward' method)</option>
         </param>
+        <param name="missing_names" type="select" label="How does the input specify sample names?">
+            <option value="">First line and first column specify sample names (fully symmetric input)</option>
+            <option value="--nr">First line specifies sample names, subsequent lines only data</option>
+            <option value="--nc">Each line specifies sample name in first column, first line is not special</option>
+        </param>
         <conditional name="cluster_assignment">
             <param name="select" type="select" label="Generate cluster assignments?">
                 <option value="dendrogram-only">No, just generate the dendrogram of clustering results</option>
@@ -42,11 +57,11 @@
             <when value="dendrogram-only" />
             <when value="n-cluster">
                 <param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" />
-                <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
+                <expand macro="cluster_assignment_options" />
             </when>
             <when value="height">
                 <param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" />
-                <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" />
+                <expand macro="cluster_assignment_options" />
             </when>
         </conditional>
     </inputs>
@@ -74,14 +89,46 @@
             <conditional name="cluster_assignment">
                 <param name="select" value="height" />
                 <param name="height" value="18" />
+                <param name="min_cluster_size" value="1" />
             </conditional>
             <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" />
         </test>
+        <test expect_num_outputs="1">
+            <param name="distmat" value="test_matrix.tsv"/>
+            <conditional name="cluster_assignment">
+                <param name="select" value="height" />
+                <param name="height" value="18" />
+            </conditional>
+            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18_s2.tsv" />
+        </test>
         <test expect_num_outputs="2">
             <param name="distmat" value="test_matrix.tsv"/>
             <conditional name="cluster_assignment">
                 <param name="select" value="n-cluster" />
                 <param name="n_cluster" value="4" />
+                <param name="min_cluster_size" value="1" />
+                <param name="generate_dendrogram" value="true" />
+            </conditional>
+            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="distmat" value="test_matrix_nr.tsv" />
+            <param name="missing_names" value="--nr" />
+            <conditional name="cluster_assignment">
+                <param name="select" value="n-cluster" />
+                <param name="n_cluster" value="4" />
+                <param name="min_cluster_size" value="1" />
+                <param name="generate_dendrogram" value="true" />
+            </conditional>
+            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
+        </test>
+        <test expect_num_outputs="2">
+            <param name="distmat" value="test_matrix_nc.tsv" />
+            <param name="missing_names" value="--nc" />
+            <conditional name="cluster_assignment">
+                <param name="select" value="n-cluster" />
+                <param name="n_cluster" value="4" />
+                <param name="min_cluster_size" value="1" />
                 <param name="generate_dendrogram" value="true" />
             </conditional>
             <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
@@ -95,7 +142,7 @@

 This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it.

-As input it expects a symmetrical distance matrix with sample names on the first row and in the first column.
+As input it expects a symmetrical distance matrix with sample names on the first row and/or in the first column.

 The clustering result can be reported in the form of a dendrogram in newick format.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_assignment_average_h18_s2.tsv	Mon Aug 19 15:33:16 2024 +0000
@@ -0,0 +1,6 @@
+sample	cluster_id_h18.0
+a	1
+b	1
+c	-
+d	-
+e	-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_matrix_nc.tsv	Mon Aug 19 15:33:16 2024 +0000
@@ -0,0 +1,5 @@
+a	0	17	21	31	23
+b	17	0	30	34	21
+c	21	30	0	28	39
+d	31	34	28	0	43
+e	23	21	39	43	0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_matrix_nr.tsv	Mon Aug 19 15:33:16 2024 +0000
@@ -0,0 +1,6 @@
+a	b	c	d	e
+0	17	21	31	23
+17	0	30	34	21
+21	30	0	28	39
+31	34	28	0	43
+23	21	39	43	0