Mercurial > repos > iuc > clustering_from_distmat

<tool id="clustering_from_distmat" name="Distance matrix-based hierarchical clustering" version="1.1" profile="23.0">
    <description>using Scipy</description>
    <macros>
        <!-- Options common to every cluster-assignment mode; expanded inside both the
             "n-cluster" and the "height" branch of the cluster_assignment conditional. -->
        <xml name="cluster_assignment_options">
            <!-- Size threshold below which a cluster is masked in the assignment table. -->
            <param name="min_cluster_size"
                   type="integer"
                   value="2"
                   min="1"
                   label="Mask clusters with less than this number of samples"
                   help="Samples assigned to clusters smaller than this threshold will have '-' in the corresponding cluster ID column" />
            <!-- Not referenced by the command template; it is only read by the
                 filter of the dendrogram output dataset. -->
            <param name="generate_dendrogram"
                   type="boolean"
                   label="Produce also the dendrogram of clustering results" />
        </xml>
    </macros>
    <!-- EDAM ontology annotations for this tool.
         NOTE(review): the meaning of the IDs is not stated in this file. Believed labels:
         topic_2269 = statistics and probability, topic_0084 = phylogeny,
         operation_3432 = clustering. Confirm against the EDAM browser. -->
    <edam_topics>
        <edam_topic>topic_2269</edam_topic>
        <edam_topic>topic_0084</edam_topic>
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_3432</edam_operation>
    </edam_operations>
    <!-- Exact package versions the wrapped Python script is run with. -->
    <requirements>
        <requirement type="package" version="3.12">python</requirement>
        <requirement type="package" version="1.14.0">scipy</requirement>
    </requirements>
    <!-- Command template (Cheetah). Runs clustering_from_distmat.py from the tool directory
         on the selected distance matrix. The literal "result" is the second positional
         argument; the outputs section collects files named result.* from the working
         directory, so it presumably acts as an output prefix (confirm in the script).
         The method and the missing_names option value are inserted as-is.
         Depending on the chosen mode, either the n-clusters or the height option is added;
         the dendrogram-only mode adds neither.
         The min-cluster-size option is only emitted for the two assignment modes and
         only when the value differs from 2, i.e. from the default declared on the parameter.
         A non-zero exit code of the script is treated as a job error. -->
    <command detect_errors="exit_code"><![CDATA[
python '$__tool_directory__/clustering_from_distmat.py'
  '$distmat'
  result
  --method $method
  $missing_names
  #if str($cluster_assignment.select) == 'n-cluster':
    --n-clusters $cluster_assignment.n_cluster
  #elif str($cluster_assignment.select) == 'height':
    --height $cluster_assignment.height
  #end if
  #if str($cluster_assignment.select) != 'dendrogram-only' and $cluster_assignment.min_cluster_size != 2:
    --min-cluster-size $cluster_assignment.min_cluster_size
  #end if
    ]]></command>
    <!-- User-facing parameters. Their names are referenced from the command template
         and from the output filters, so renaming any of them requires changes there too. -->
    <inputs>
        <param name="distmat" type="data" format="tabular" label="Distance matrix" />
        <!-- The selected value is inserted unchanged as the method argument of the script;
             "average" is preselected. -->
        <param name="method" type="select" label="Clustering method">
            <option value="single">Nearest Point (scipy 'single' method)</option>
            <option value="complete">Farthest Point (scipy 'complete' method)</option>
            <option value="average" selected="true">UPGMA (scipy 'average' method)</option>
            <option value="weighted">WPGMA (scipy 'weighted' method)</option>
            <option value="centroid">UPGMC (scipy 'centroid' method)</option>
            <option value="median">WPGMC (scipy 'median' method)</option>
            <option value="ward">Ward/Incremental (scipy 'ward' method)</option>
        </param>
        <!-- Each option value is itself the command-line flag (or an empty string for the
             fully symmetric layout) that the command template inserts verbatim. -->
        <param name="missing_names" type="select" label="How does the input specify sample names?">
            <option value="">First line and first column specify sample names (fully symmetric input)</option>
            <option value="--nr">First line specifies sample names, subsequent lines only data</option>
            <option value="--nc">Each line specifies sample name in first column, first line is not special</option>
        </param>
        <!-- Output mode switch. The "dendrogram-only" branch carries no parameters; the two
             assignment branches each add their cut criterion plus the shared
             cluster_assignment_options macro. -->
        <conditional name="cluster_assignment">
            <param name="select" type="select" label="Generate cluster assignments?">
                <option value="dendrogram-only">No, just generate the dendrogram of clustering results</option>
                <option value="n-cluster">Yes, and divide into specified number of clusters </option>
                <option value="height">Yes, and use distance threshold to divide into clusters</option>
            </param>
            <when value="dendrogram-only" />
            <when value="n-cluster">
                <param name="n_cluster" type="integer" value="5" min="1" label="How many clusters to divide into?" />
                <expand macro="cluster_assignment_options" />
            </when>
            <when value="height">
                <!-- NOTE(review): no lower bound is declared for the threshold; confirm
                     whether negative values are meaningful to the script. -->
                <param name="height" type="float" value="5.0" label="Distance threshold for clusters to be reported" />
                <expand macro="cluster_assignment_options" />
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Newick tree. Kept when only the dendrogram was requested, or when the user
             ticked generate_dendrogram in one of the assignment modes. -->
        <data name="clustering_dendrogram"
              format="newick"
              from_work_dir="result.tree.newick"
              label="${tool.name} on ${on_string}: Dendrogram">
            <filter>cluster_assignment["select"] == "dendrogram-only" or cluster_assignment["generate_dendrogram"]</filter>
        </data>
        <!-- Tabular cluster assignments. Kept for both assignment modes, never for
             the dendrogram-only mode. -->
        <data name="clustering_assignment"
              format="tabular"
              from_work_dir="result.cluster_assignments.tsv"
              label="${tool.name} on ${on_string}: Cluster assignment">
            <filter>cluster_assignment["select"] in ["n-cluster", "height"]</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test data and expected results taken from https://en.wikipedia.org/wiki/UPGMA#Working_example -->
        <!-- Defaults only: average linkage, dendrogram-only mode. -->
        <test expect_num_outputs="1">
            <param name="distmat" value="test_matrix.tsv"/>
            <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
        </test>
        <!-- Non-default linkage method, still dendrogram-only. -->
        <test expect_num_outputs="1">
            <param name="distmat" value="test_matrix.tsv" />
            <param name="method" value="complete" />
            <output name="clustering_dendrogram" ftype="newick" file="test_tree_complete.newick" />
        </test>
        <!-- Cut by height, no masking of small clusters (min_cluster_size=1). -->
        <test expect_num_outputs="1">
            <param name="distmat" value="test_matrix.tsv"/>
            <conditional name="cluster_assignment">
                <param name="select" value="height" />
                <param name="height" value="18" />
                <param name="min_cluster_size" value="1" />
            </conditional>
            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18.tsv" />
        </test>
        <!-- Cut by height with the default min_cluster_size of 2. -->
        <test expect_num_outputs="1">
            <param name="distmat" value="test_matrix.tsv"/>
            <conditional name="cluster_assignment">
                <param name="select" value="height" />
                <param name="height" value="18" />
            </conditional>
            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_h18_s2.tsv" />
        </test>
        <!-- Cut into a fixed number of clusters and also keep the dendrogram.
             Same matrix and method as the first test, so the tree must match its expected file. -->
        <test expect_num_outputs="2">
            <param name="distmat" value="test_matrix.tsv"/>
            <conditional name="cluster_assignment">
                <param name="select" value="n-cluster" />
                <param name="n_cluster" value="4" />
                <param name="min_cluster_size" value="1" />
                <param name="generate_dendrogram" value="true" />
            </conditional>
            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
            <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
        </test>
        <!-- Input layout without a sample-name column.
             Assumes test_matrix_nr.tsv holds the same names and distances as test_matrix.tsv
             (the expected assignment file is shared), hence the same expected tree. -->
        <test expect_num_outputs="2">
            <param name="distmat" value="test_matrix_nr.tsv" />
            <param name="missing_names" value="--nr" />
            <conditional name="cluster_assignment">
                <param name="select" value="n-cluster" />
                <param name="n_cluster" value="4" />
                <param name="min_cluster_size" value="1" />
                <param name="generate_dendrogram" value="true" />
            </conditional>
            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
            <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
        </test>
        <!-- Input layout without a header line.
             Assumes test_matrix_nc.tsv holds the same names and distances as test_matrix.tsv
             (the expected assignment file is shared), hence the same expected tree. -->
        <test expect_num_outputs="2">
            <param name="distmat" value="test_matrix_nc.tsv" />
            <param name="missing_names" value="--nc" />
            <conditional name="cluster_assignment">
                <param name="select" value="n-cluster" />
                <param name="n_cluster" value="4" />
                <param name="min_cluster_size" value="1" />
                <param name="generate_dendrogram" value="true" />
            </conditional>
            <output name="clustering_assignment" ftype="tabular" file="test_assignment_average_n4.tsv" />
            <output name="clustering_dendrogram" ftype="newick" file="test_tree_average.newick" />
        </test>
    </tests>
    <!-- User-facing help, written in reStructuredText. Everything inside the CDATA section
         is rendered text, so whitespace and blank lines there are significant. -->
    <help><![CDATA[

.. class:: infomark

**What it does**

This tool lets you perform hierarchical clustering of samples using the `scipy.cluster.hierarchy.linkage`_ function and any of the clustering methods supported by it.

As input it expects a symmetrical distance matrix with sample names on the first row and/or in the first column.

The clustering result can be reported in the form of a dendrogram in newick format.

Additionally, the tool can report the assignment of the samples to clusters "cut" from the clustering tree using the `scipy.cluster.hierarchy.cut_tree`_ function.
Reflecting the parameters of that function, you can specify *how* to cut the tree by specifying either the number of clusters to cut into or a distance threshold, i.e., the height at which to cut the tree as SciPy calls it.

.. _`scipy.cluster.hierarchy.linkage`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
.. _`scipy.cluster.hierarchy.cut_tree`: https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.cut_tree.html
    ]]></help>
    <!-- Publication to cite when using this tool.
         NOTE(review): presumably the SciPy reference paper, since SciPy is the only
         library the help text names; confirm the DOI target. -->
    <citations>
        <citation type="doi">10.1038/s41592-019-0686-2</citation>
    </citations>
</tool>
author	iuc
date	Mon, 19 Aug 2024 15:33:16 +0000
parents	8192b416f945
children	f8ee933de3ca