Mercurial > repos > iuc > clustering_from_distmat
changeset 1:c0b01c55a0e0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit 65b5c6f177478883ce664aeb6f27d0bec7155fdc
author | iuc |
---|---|
date | Mon, 19 Aug 2024 15:33:16 +0000 |
parents | 8192b416f945 |
children | f8ee933de3ca |
files | clustering_from_distmat.py clustering_from_distmat.xml test-data/test_assignment_average_h18_s2.tsv test-data/test_matrix_nc.tsv test-data/test_matrix_nr.tsv |
diffstat | 5 files changed, 162 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- a/clustering_from_distmat.py Thu Aug 08 19:34:36 2024 +0000 +++ b/clustering_from_distmat.py Mon Aug 19 15:33:16 2024 +0000 @@ -1,5 +1,6 @@ import argparse import sys +from collections import Counter import scipy @@ -45,6 +46,15 @@ ], help="Clustering method to use" ) + missing_names = parser.add_mutually_exclusive_group() + missing_names.add_argument( + "--nc", "--no-colnames", action="store_true", + help="Indicate that the distance matrix input does not feature column names" + ) + missing_names.add_argument( + "--nr", "--no-rownames", action="store_true", + help="Indicate that the distance matrix input does not feature row names" + ) cut_mode = parser.add_mutually_exclusive_group() cut_mode.add_argument( "-n", "--n-clusters", nargs="*", type=int @@ -52,39 +62,67 @@ cut_mode.add_argument( "--height", nargs="*", type=float ) + parser.add_argument("-s", "--min-cluster-size", type=int, default=2) args = parser.parse_args() - # TO DO: - # - parse outputs to generate - # read from input and check that # we have been passed a symmetric distance matrix with open(args.infile) as i: - col_names = next(i).rstrip("\n\r").split("\t")[1:] - col_count = len(col_names) - if not col_count: - sys.exit( - 'No data columns found. ' - 'This tool expects tabular input with column names on the first line ' - 'and a row name in the first column of each row followed by data columns.' - ) + col_count = None row_count = 0 matrix = [] + if args.nc: + col_names = col_count = None + else: + while True: + # skip leading empty lines + line = next(i).rstrip("\n\r") + if line: + break + if args.nr: + col_names = line.split("\t") + else: + # first column is for row names, rest are column names + col_names = line.split("\t")[1:] + col_count = len(col_names) + if not col_count: + sys.exit( + 'No data columns found. ' + 'By default, this tool expects tabular input with column names on the first line ' + 'and a row name in the first column of each row followed by data columns. ' + 'Use --no-colnames or --no-rownames to modify the expected format.' + ) for line in i: if not line.strip(): # skip empty lines continue row_count += 1 - if row_count > col_count: + if col_count is not None and row_count > col_count: sys.exit( 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' 'but got more rows than columns.' ) - row_name, *row_data = line.strip(" \n\r").split("\t") + if args.nr: + row_name = None + row_data = line.rstrip("\n\r").split("\t") + else: + row_name, *row_data = line.rstrip("\n\r").split("\t") + if col_count is None: + col_count = len(row_data) + col_names = [None] * col_count col_name = col_names[row_count - 1] - if not row_name: + if not row_name and col_name: # tolerate omitted row names, use col name instead row_name = col_name + elif row_name and not col_name: + # likewise for column names + # plus update list of col names with row name + col_name = col_names[row_count - 1] = row_name + elif not row_name and not col_name: + sys.exit( + 'Each sample in the distance matrix must have its name specified via a row name, a column name, or both, ' + f'but found no name for sample number {row_count}' + ) if row_name != col_name: sys.exit( 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' @@ -98,7 +136,10 @@ try: matrix.append([float(x) for x in row_data]) except ValueError as e: - sys.exit(str(e) + f' on row {row_count} ("{row_name}")') + if args.nr: + sys.exit(str(e) + f' on row {row_count}') + else: + sys.exit(str(e) + f' on row {row_count} ("{row_name}")') if row_count < col_count: sys.exit( 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' @@ -128,18 +169,47 @@ header_cols = ["sample"] + [ colname_template.format(x) for x in cut_values ] + cut_result = scipy.cluster.hierarchy.cut_tree( + linkage, + args.n_clusters, + args.height + ) + + # Go through the cut results once to determine cluster sizes + + # In the final report, the ids of clusters with fewer members than + # args.min_cluster_size will be masked with "-". + # The remaining cluster ids will be renumbered to start fom 1. + # This has to be done for each clustering resulting from the + # user-specified cut_values. + cluster_member_counts = [Counter() for _ in cut_values] + effective_cluster_ids = [{} for _ in cut_values] + for cluster_ids in cut_result: + for cl_count, cl_id, eff_id in zip(cluster_member_counts, cluster_ids, effective_cluster_ids): + cl_count[cl_id] += 1 + for counter, eff_ids in zip(cluster_member_counts, effective_cluster_ids): + eff_id = 1 + for item, count in counter.items(): + # Since Python 3.7, Counter objects (like dicts) preserve + # insertion order so we can be sure that in the mapping + # constructed below, clusters will get renumbered in + # the order they will be reported later. + if count >= args.min_cluster_size: + eff_ids[item] = str(eff_id) + eff_id += 1 + else: + eff_ids[item] = "-" + + # build and write the cluster assignment report + # with remapped cluster ids cluster_assignments = [] - for name, cluster_ids in zip( - col_names, - scipy.cluster.hierarchy.cut_tree( - linkage, - args.n_clusters, - args.height - ) - ): + for name, cluster_ids in zip(col_names, cut_result): cluster_assignments.append( [name] - + [str(c + 1) for c in cluster_ids] + + [ + eff_ids[c] + for c, eff_ids in zip(cluster_ids, effective_cluster_ids) + ] ) with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: print("\t".join(header_cols), file=o)
--- a/clustering_from_distmat.xml Thu Aug 08 19:34:36 2024 +0000 +++ b/clustering_from_distmat.xml Mon Aug 19 15:33:16 2024 +0000 @@ -1,5 +1,11 @@ -<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0"> +<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.1" profile="23.0"> <description>using Scipy</description> + <macros> + <xml name="cluster_assignment_options"> + <param name="min_cluster_size" type="integer" value="2" min="1" label="Mask clusters with less than this number of samples" help="Samples assigned to clusters smaller than this threshold will have '-' in the corresponding cluster ID column" /> + <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> + </xml> + </macros> <edam_topics> <edam_topic>topic_2269</edam_topic> <edam_topic>topic_0084</edam_topic> @@ -16,11 +22,15 @@ '$distmat' result --method $method + $missing_names #if str($cluster_assignment.select) == 'n-cluster': --n-clusters $cluster_assignment.n_cluster #elif str($cluster_assignment.select) == 'height': --height $cluster_assignment.height #end if + #if str($cluster_assignment.select) != 'dendrogram-only' and $cluster_assignment.min_cluster_size != 2: + --min-cluster-size $cluster_assignment.min_cluster_size + #end if ]]></command> <inputs> <param name="distmat" type="data" format="tabular" label="Distance matrix" /> @@ -33,6 +43,11 @@ <option value="median">WPGMC (scipy 'median' method)</option> <option value="ward">Ward/Incremental (scipy 'ward' method)</option> </param> + <param name="missing_names" type="select" label="How does the input specify sample names?"> + <option value="">First line and first column specify sample names (fully symmetric input)</option> + <option value="--nr">First line specifies sample names, subsequent lines only data</option> + <option value="--nc">Each line specifies sample name in first column, first line is not special</option> + </param> <conditional name="cluster_assignment"> <param name="select" type="select" label="Generate cluster assignments?"> <option value="dendrogram-only">No, just generate the dendrogram of clustering results</option> @@ -42,11 +57,11 @@ <when value="dendrogram-only" /> <when value="n-cluster"> <param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" /> - <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> + <expand macro="cluster_assignment_options" /> </when> <when value="height"> <param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" /> - <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> + <expand macro="cluster_assignment_options" /> </when> </conditional> </inputs> @@ -74,14 +89,46 @@ <conditional name="cluster_assignment"> <param name="select" value="height" /> <param name="height" value="18" /> + <param name="min_cluster_size" value="1" /> </conditional> <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" /> </test> + <test expect_num_outputs="1"> + <param name="distmat" value="test_matrix.tsv"/> + <conditional name="cluster_assignment"> + <param name="select" value="height" /> + <param name="height" value="18" /> + </conditional> + <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18_s2.tsv" /> + </test> <test expect_num_outputs="2"> <param name="distmat" value="test_matrix.tsv"/> <conditional name="cluster_assignment"> <param name="select" value="n-cluster" /> <param name="n_cluster" value="4" /> + <param name="min_cluster_size" value="1" /> + <param name="generate_dendrogram" value="true" /> + </conditional> + <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" /> + </test> + <test expect_num_outputs="2"> + <param name="distmat" value="test_matrix_nr.tsv" /> + <param name="missing_names" value="--nr" /> + <conditional name="cluster_assignment"> + <param name="select" value="n-cluster" /> + <param name="n_cluster" value="4" /> + <param name="min_cluster_size" value="1" /> + <param name="generate_dendrogram" value="true" /> + </conditional> + <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" /> + </test> + <test expect_num_outputs="2"> + <param name="distmat" value="test_matrix_nc.tsv" /> + <param name="missing_names" value="--nc" /> + <conditional name="cluster_assignment"> + <param name="select" value="n-cluster" /> + <param name="n_cluster" value="4" /> + <param name="min_cluster_size" value="1" /> <param name="generate_dendrogram" value="true" /> </conditional> <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" /> @@ -95,7 +142,7 @@ This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it. -As input it expects a symmetrical distance matrix with sample names on the first row and in the first column. +As input it expects a symmetrical distance matrix with sample names on the first row and/or in the first column. The clustering result can be reported in the form of a dendrogram in newick format.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_assignment_average_h18_s2.tsv Mon Aug 19 15:33:16 2024 +0000 @@ -0,0 +1,6 @@ +sample cluster_id_h18.0 +a 1 +b 1 +c - +d - +e -