Mercurial > repos > iuc > clustering_from_distmat
changeset 0:8192b416f945 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/clustering_from_distmat/ commit a34052b87a2d05cabed5001c50f1bb10e74f97ee
author | iuc |
---|---|
date | Thu, 08 Aug 2024 19:34:36 +0000 |
parents | |
children | c0b01c55a0e0 |
files | clustering_from_distmat.py clustering_from_distmat.xml test-data/test_assignment_average_h18.tsv test-data/test_assignment_average_n4.tsv test-data/test_matrix.tsv test-data/test_tree_average.newick test-data/test_tree_complete.newick |
diffstat | 7 files changed, 278 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustering_from_distmat.py Thu Aug 08 19:34:36 2024 +0000 @@ -0,0 +1,147 @@ +import argparse +import sys + +import scipy + + +def linkage_as_newick(linkage, tip_names): + newick_parts = tip_names[::] + within_cluster_distances = [0] * len(tip_names) + for step in linkage: + n1 = int(step[0]) + n2 = int(step[1]) + d = float(step[2]) + d1 = d - within_cluster_distances[n1] + d2 = d - within_cluster_distances[n2] + id1 = newick_parts[n1] + id2 = newick_parts[n2] + part = f'({id1}:{d1 / 2},{id2}:{d2 / 2})' + within_cluster_distances.append(d) + newick_parts.append(part) + return newick_parts[-1].format(*newick_parts) + ';' + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + 'infile', + help='Distance matrix input file' + ) + parser.add_argument( + 'out_prefix', + help="Output prefix" + ) + parser.add_argument + parser.add_argument( + '-m', '--method', default="average", + choices=[ + "single", + "complete", + "average", + "weighted", + "centroid", + "median", + "ward" + ], + help="Clustering method to use" + ) + cut_mode = parser.add_mutually_exclusive_group() + cut_mode.add_argument( + "-n", "--n-clusters", nargs="*", type=int + ) + cut_mode.add_argument( + "--height", nargs="*", type=float + ) + args = parser.parse_args() + + # TO DO: + # - parse outputs to generate + + # read from input and check that + # we have been passed a symmetric distance matrix + with open(args.infile) as i: + col_names = next(i).rstrip("\n\r").split("\t")[1:] + col_count = len(col_names) + if not col_count: + sys.exit( + 'No data columns found. ' + 'This tool expects tabular input with column names on the first line ' + 'and a row name in the first column of each row followed by data columns.' + ) + row_count = 0 + matrix = [] + for line in i: + if not line.strip(): + # skip empty lines + continue + row_count += 1 + if row_count > col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' + 'but got more rows than columns.' + ) + row_name, *row_data = line.strip(" \n\r").split("\t") + col_name = col_names[row_count - 1] + if not row_name: + # tolerate omitted row names, use col name instead + row_name = col_name + if row_name != col_name: + sys.exit( + 'This tool expects a symmetric distance matrix with identical names for rows and columns, ' + f'but got "{col_name}" in column {row_count} and "{row_name}" on row {row_count}.' + ) + if len(row_data) != col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with the same number of columns on each row, ' + f'but row {row_count} ("{row_name}") has {len(row_data)} columns instead of {col_count}.' + ) + try: + matrix.append([float(x) for x in row_data]) + except ValueError as e: + sys.exit(str(e) + f' on row {row_count} ("{row_name}")') + if row_count < col_count: + sys.exit( + 'This tool expects a symmetric distance matrix with an equal number of rows and columns, ' + 'but got more columns than rows.' + ) + + # turn the distance matrix into "condensed" vector form + # this gives us further checks and raises ValueErrors if: + # - the values on the diagonal aren't zero + # - the upper and lower triangle of the matrix aren't identical + D = scipy.spatial.distance.squareform(matrix) + + # perform the requested clustering and retrieve the result as a linkage object + linkage = scipy.cluster.hierarchy.linkage(D, args.method) + + with open(args.out_prefix + '.tree.newick', 'w') as o: + o.write(linkage_as_newick(linkage, col_names)) + + # cut the tree as specified and report sample to cluster assignments + if args.n_clusters or args.height: + if args.n_clusters: + cut_values = args.n_clusters + colname_template = "cluster_id_n{}" + else: + cut_values = args.height + colname_template = "cluster_id_h{}" + header_cols = ["sample"] + [ + colname_template.format(x) for x in cut_values + ] + cluster_assignments = [] + for name, cluster_ids in zip( + col_names, + scipy.cluster.hierarchy.cut_tree( + linkage, + args.n_clusters, + args.height + ) + ): + cluster_assignments.append( + [name] + + [str(c + 1) for c in cluster_ids] + ) + with open(args.out_prefix + '.cluster_assignments.tsv', 'w') as o: + print("\t".join(header_cols), file=o) + for ass in cluster_assignments: + print("\t".join(ass), file=o)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/clustering_from_distmat.xml Thu Aug 08 19:34:36 2024 +0000 @@ -0,0 +1,111 @@ +<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.0" profile="23.0"> + <description>using Scipy</description> + <edam_topics> + <edam_topic>topic_2269</edam_topic> + <edam_topic>topic_0084</edam_topic> + </edam_topics> + <edam_operations> + <edam_operation>operation_3432</edam_operation> + </edam_operations> + <requirements> + <requirement type="package" version="3.12">python</requirement> + <requirement type="package" version="1.14.0">scipy</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/clustering_from_distmat.py' + '$distmat' + result + --method $method + #if str($cluster_assignment.select) == 'n-cluster': + --n-clusters $cluster_assignment.n_cluster + #elif str($cluster_assignment.select) == 'height': + --height $cluster_assignment.height + #end if + ]]></command> + <inputs> + <param name="distmat" type="data" format="tabular" label="Distance matrix" /> + <param name="method" type="select" label="Clustering method"> + <option value="single">Nearest Point (scipy 'single' method)</option> + <option value="complete">Farthest Point (scipy 'complete' method)</option> + <option value="average" selected="true">UPGMA (scipy 'average' method)</option> + <option value="weighted">WPGMA (scipy 'weighted' method)</option> + <option value="centroid">UPGMC (scipy 'centroid' method)</option> + <option value="median">WPGMC (scipy 'median' method)</option> + <option value="ward">Ward/Incremental (scipy 'ward' method)</option> + </param> + <conditional name="cluster_assignment"> + <param name="select" type="select" label="Generate cluster assignments?"> + <option value="dendrogram-only">No, just generate the dendrogram of clustering results</option> + <option value="n-cluster">Yes, and divide into specified number of clusters </option> + <option value="height">Yes, and use distance threshold to divide into clusters</option> + </param> + <when value="dendrogram-only" /> + <when value="n-cluster"> + <param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" /> + <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> + </when> + <when value="height"> + <param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" /> + <param name="generate_dendrogram" type="boolean" label="Produce also the dendrogram of clustering results" /> + </when> + </conditional> + </inputs> + <outputs> + <data name="clustering_dendrogram" format="newick" from_work_dir="result.tree.newick" label="${tool.name} on ${on_string}: Dendrogram"> + <filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter> + </data> + <data name="clustering_assignment" format="tabular" from_work_dir="result.cluster_assignments.tsv" label="${tool.name} on ${on_string}: Cluster assignment"> + <filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter> + </data> + </outputs> + <tests> + <!-- Test data and expected results taken from https://en.wikipedia.org/wiki/UPGMA#Working_example --> + <test expect_num_outputs="1"> + <param name="distmat" value="test_matrix.tsv"/> + <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" /> + </test> + <test expect_num_outputs="1"> + <param name="distmat" value="test_matrix.tsv" /> + <param name="method" value="complete" /> + <output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" /> + </test> + <test expect_num_outputs="1"> + <param name="distmat" value="test_matrix.tsv"/> + <conditional name="cluster_assignment"> + <param name="select" value="height" /> + <param name="height" value="18" /> + </conditional> + <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" /> + </test> + <test expect_num_outputs="2"> + <param name="distmat" value="test_matrix.tsv"/> + <conditional name="cluster_assignment"> + <param name="select" value="n-cluster" /> + <param name="n_cluster" value="4" /> + <param name="generate_dendrogram" value="true" /> + </conditional> + <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" /> + </test> + </tests> + <help><![CDATA[ + +.. class:: infomark + +**What it does** + +This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it. + +As input it expects a symmetrical distance matrix with sample names on the first row and in the first column. + +The clustering result can be reported in the form of a dendrogram in newick format. + +Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function. +Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it. + +.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html +.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html + ]]></help> + <citations> + <citation type="doi">10.1038/s41592-019-0686-2</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_assignment_average_h18.tsv Thu Aug 08 19:34:36 2024 +0000 @@ -0,0 +1,6 @@ +sample cluster_id_h18.0 +a 1 +b 1 +c 2 +d 3 +e 4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_assignment_average_n4.tsv Thu Aug 08 19:34:36 2024 +0000 @@ -0,0 +1,6 @@ +sample cluster_id_n4 +a 1 +b 1 +c 2 +d 3 +e 4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_matrix.tsv Thu Aug 08 19:34:36 2024 +0000 @@ -0,0 +1,6 @@ + a b c d e +a 0 17 21 31 23 +b 17 0 30 34 21 +c 21 30 0 28 39 +d 31 34 28 0 43 +e 23 21 39 43 0