changeset 0:60f93f839759 draft default tip

Uploaded
author saket-choudhary
date Tue, 07 Oct 2014 19:35:50 -0400
parents
children
files inchlib_clust/inchlib_clust.py inchlib_clust/inchlib_clust.xml inchlib_clust/test-data/inchlib_input.csv inchlib_clust/test-data/inchlib_input1.csv inchlib_clust/test-data/inchlib_input_metadata.csv inchlib_clust/test-data/inchlib_output.json inchlib_clust/tool_dependencies.xml
diffstat 7 files changed, 921 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/inchlib_clust.py	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,533 @@
+#coding: utf-8
+from __future__ import print_function
+
+import csv, json, copy, re, argparse, os, urllib2
+
+import numpy, scipy, fastcluster, sklearn
+import scipy.cluster.hierarchy as hcluster
+from sklearn import preprocessing
+from scipy import spatial
+
# Linkage methods accepted by fastcluster/scipy hierarchical clustering.
LINKAGES = ["single", "complete", "average", "centroid", "ward", "median", "weighted"]
# Linkages that must be computed from the raw observation matrix rather than
# from a precomputed condensed distance vector (see Cluster.cluster_data).
RAW_LINKAGES = ["ward", "centroid"]
# Distance metrics that are valid for each supported data type.
DISTANCES = {"numeric": ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "euclidean", "mahalanobis", "minkowski", "seuclidean", "sqeuclidean"],
              "binary": ["dice","hamming","jaccard","kulsinski","matching","rogerstanimoto","russellrao","sokalmichener","sokalsneath","yule"]}
+
class Dendrogram():
    """Class which handles the generation of cluster heatmap format of clustered data.
    As an input it takes a Cluster instance with clustered data."""

    def __init__(self, clustering):
        # clustering is a Cluster instance whose cluster_data() has already
        # been run; keep shortcuts to the attributes used throughout.
        self.cluster_object = clustering
        self.data_type = clustering.data_type
        self.axis = clustering.clustering_axis
        self.clustering = clustering.clustering
        self.tree = hcluster.to_tree(self.clustering)
        self.data = clustering.data
        self.data_names = clustering.data_names
        self.header = clustering.header
        # False until create_cluster_heatmap() populates it.
        self.dendrogram = False

    def __get_cluster_heatmap__(self, write_data):
        """Translates the row linkage matrix into the InCHlib nodes dictionary.
        When write_data is False the leaf feature vectors are left empty."""
        root, nodes = hcluster.to_tree(self.clustering, rd=True)
        node_id2node = {}
        dendrogram = {"nodes":{}}

        for node in nodes:
            node_id = node.id
            if node.count == 1:
                # Leaf node: zero distance; objects/features attached below.
                node_id2node[node_id] = {"count":1, "distance":0}
            else:
                node_left_child = node.get_left().id
                node_right_child = node.get_right().id
                node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child}

        # Wire each child node back to its parent id.
        for n in node_id2node:
            node = node_id2node[n]
            if node["count"] != 1:
                node_id2node[node["left_child"]]["parent"] = n
                node_id2node[node["right_child"]]["parent"] = n

        # Attach object names and (optionally) the data row to every leaf.
        # NOTE: the original also re-assigned the parent's left/right child
        # to the very same id here -- a no-op that has been removed.
        for n in node_id2node:
            node = node_id2node[n]

            if node["count"] == 1:
                data = self.data[n]
                node["objects"] = [self.data_names[n]]

                if not write_data:
                    data = []

                node["features"] = data
                dendrogram["nodes"][n] = node

        for n in node_id2node:
            if node_id2node[n]["count"] != 1:
                dendrogram["nodes"][n] = node_id2node[n]

        return dendrogram

    def __get_column_dendrogram__(self):
        """Builds the nodes dictionary for the column dendrogram (no leaf data)."""
        root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True)
        node_id2node = {}
        dendrogram = {"nodes":{}}

        for node in nodes:
            node_id = node.id
            if node.count == 1:
                node_id2node[node_id] = {"count":1, "distance":0}
            else:
                node_left_child = node.get_left().id
                node_right_child = node.get_right().id
                node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child}

        for n in node_id2node:
            node = node_id2node[n]
            if node["count"] != 1:
                node_id2node[node["left_child"]]["parent"] = n
                node_id2node[node["right_child"]]["parent"] = n

        for n in node_id2node:
            if not n in dendrogram["nodes"]:
                dendrogram["nodes"][n] = node_id2node[n]

        return dendrogram

    def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
        """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can
        cut the dendrogram in a distance to decrease the row size of the heatmap to specified count.
        When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean).
        When the metadata are nominal (text values) the most frequent is the result after compression.
        By setting write_data to False the data features won't be present in the resulting format."""
        self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

        self.compress = compress
        self.compressed_value = compressed_value
        self.compress_cluster_treshold = 0
        if self.compress and self.compress >= 0:
            self.compress_cluster_treshold = self.__get_distance_treshold__(compress)
            print("Distance treshold for compression:", self.compress_cluster_treshold)
            if self.compress_cluster_treshold >= 0:
                self.__compress_data__()
        else:
            self.compress = False

        if self.header and write_data:
            self.dendrogram["data"]["feature_names"] = [h for h in self.header]
        elif self.header and not write_data:
            self.dendrogram["data"]["feature_names"] = []

        # Column dendrogram only when both axes were clustered.
        # (The original computed an unused hcluster.to_tree() here; removed.)
        if self.axis == "both" and len(self.cluster_object.column_clustering):
            self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
        return

    def __compress_data__(self):
        """Merges leaves whose parent distance is below the computed threshold
        into single pseudo-leaf rows (median/mean per feature column)."""
        to_remove = set()

        compressed_value2fnc = {
            "median": lambda values: [round(numpy.median(value), 3) for value in values],
            "mean": lambda values: [round(numpy.average(value), 3) for value in values],
        }

        for n in self.dendrogram["data"]["nodes"]:
            node = self.dendrogram["data"]["nodes"][n]

            if node["count"] == 1:
                objects = node["objects"]
                data = node["features"]
                node_id = n

                # Climb towards the root while the parent is still below the
                # threshold; everything passed on the way gets removed.
                while self.dendrogram["data"]["nodes"][node["parent"]]["distance"] <= self.compress_cluster_treshold:
                    to_remove.add(node_id)
                    node_id = node["parent"]
                    node = self.dendrogram["data"]["nodes"][node_id]

                if node["count"] != 1:

                    if not "objects" in self.dendrogram["data"]["nodes"][node_id]:
                        self.dendrogram["data"]["nodes"][node_id]["objects"] = []
                        self.dendrogram["data"]["nodes"][node_id]["features"] = []

                    self.dendrogram["data"]["nodes"][node_id]["objects"].extend(objects)

                    if data:
                        self.dendrogram["data"]["nodes"][node_id]["features"].append(data)

        for node in to_remove:
            self.dendrogram["data"]["nodes"].pop(node)

        # Inner nodes that absorbed leaves become leaves themselves:
        # distance 0, count 1, features collapsed column-wise.
        for k in self.dendrogram["data"]["nodes"]:
            node = self.dendrogram["data"]["nodes"][k]
            if "objects" in node and node["count"] != 1:
                self.dendrogram["data"]["nodes"][k]["distance"] = 0
                self.dendrogram["data"]["nodes"][k]["count"] = 1
                self.dendrogram["data"]["nodes"][k].pop("left_child")
                self.dendrogram["data"]["nodes"][k].pop("right_child")
                rows = zip(*self.dendrogram["data"]["nodes"][k]["features"])
                self.dendrogram["data"]["nodes"][k]["features"] = compressed_value2fnc[self.compressed_value](rows)

        self.__adjust_node_counts__()

        return

    def __adjust_node_counts__(self):
        """Recomputes inner-node counts bottom-up after compression."""
        leaves = []

        for n in self.dendrogram["data"]["nodes"]:
            if self.dendrogram["data"]["nodes"][n]["count"] > 1:
                self.dendrogram["data"]["nodes"][n]["count"] = 0
            else:
                leaves.append(n)

        # Every leaf contributes 1 to each of its ancestors.
        for n in leaves:
            node = self.dendrogram["data"]["nodes"][n]
            parent_id = node["parent"]

            while parent_id:
                node = self.dendrogram["data"]["nodes"][parent_id]
                self.dendrogram["data"]["nodes"][parent_id]["count"] += 1
                parent_id = False
                if "parent" in node:
                    parent_id = node["parent"]
        return

    def __get_distance_treshold__(self, cluster_count):
        """Binary-searches the cut distance that yields cluster_count clusters.
        Returns -1 when cluster_count is not smaller than the leaf count."""
        print("Calculating distance treshold for cluster compression...")
        if cluster_count >= self.tree.count:
            return -1

        i = 0
        count = cluster_count + 1
        test_step = self.tree.dist/2

        while test_step >= 0.1:
            count = len(set([c for c in hcluster.fcluster(self.clustering, i, "distance")]))
            if count < cluster_count:
                if i == 0:
                    return 0
                i = i - test_step
                test_step = test_step/2
            elif count == cluster_count:
                return i
            else:
                i += test_step

        return i+test_step*2

    def export_cluster_heatmap_as_json(self, filename=None):
        """Returns cluster heatmap in a JSON format or exports it to the file specified by the filename parameter."""
        dendrogram_json = json.dumps(self.dendrogram, indent=4)
        if filename:
            # FIX: context manager -- the original leaked the file handle.
            with open(filename, "w") as output:
                output.write(dendrogram_json)
        return dendrogram_json

    def export_cluster_heatmap_as_html(self, htmldir="."):
        """Export simple HTML page with embedded cluster heatmap and dependencies to given directory."""
        if not os.path.exists(htmldir):
            os.makedirs(htmldir)
        dendrogram_json = json.dumps(self.dendrogram, indent=4)
        template = """<html>
        <head>
            <script src="jquery-2.0.3.min.js"></script>
            <script src="kinetic-v5.0.0.min.js"></script>
            <script src="inchlib-1.0.1.min.js"></script>
            <script>
            $(document).ready(function() {{
                var data = {};
                var inchlib = new InCHlib({{
                    target: "inchlib",
                    max_height: 1200,
                    width: 1000,
                }});
                inchlib.read_data(data);
                inchlib.draw();
            }});
            </script>
        </head>

        <body>
            <div id="inchlib"></div>
        </body>
        </html>""".format(dendrogram_json)

        lib2url = {"inchlib-1.0.1.min.js": "http://openscreen.cz/software/inchlib/static/js/inchlib-1.0.1.min.js",
                    "jquery-2.0.3.min.js": "http://openscreen.cz/software/inchlib/static/js/jquery-2.0.3.min.js",
                    "kinetic-v5.0.0.min.js": "http://openscreen.cz/software/inchlib/static/js/kinetic-v5.0.0.min.js"}

        for lib, url in lib2url.items():
            try:
                source = urllib2.urlopen(url)
                source_html = source.read()
                with open(os.path.join(htmldir, lib), "w") as output:
                    output.write(source_html)
            except urllib2.URLError:
                raise Exception("\nCan't download file {}.\nPlease check your internet connection and try again.\nIf the error persists there can be something wrong with the InCHlib server.\n".format(url))

        # FIX: the original referenced the misspelled name "htmdlir",
        # which raised NameError on every successful download run.
        with open(os.path.join(htmldir, "inchlib.html"), "w") as output:
            output.write(template)
        return

    def add_metadata_from_file(self, metadata_file, delimiter, header=True, metadata_compressed_value="median"):
        """Adds metadata from csv file.
        Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)"""
        self.metadata_compressed_value = metadata_compressed_value
        self.metadata, self.metadata_header = self.__read_metadata_file__(metadata_file, delimiter, header)
        self.__connect_metadata_to_data__()
        return

    def add_metadata(self, metadata, header=True, metadata_compressed_value="median"):
        """Adds metadata in a form of list of lists (tuples).
        Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)"""
        self.metadata_compressed_value = metadata_compressed_value
        self.metadata, self.metadata_header = self.__read_metadata__(metadata, header)
        self.__connect_metadata_to_data__()
        return

    def __connect_metadata_to_data__(self):
        """Maps metadata rows onto dendrogram leaves; merged (compressed)
        leaves get one compressed value per metadata column."""
        if len(set(self.metadata.keys()) & set(self.data_names)) == 0:
            raise Exception("Metadata objects must correspond with original data objects.")

        if not self.dendrogram:
            raise Exception("You must create dendrogram before adding metadata.")

        self.dendrogram["metadata"] = {"nodes":{}}

        if self.metadata_header:
            self.dendrogram["metadata"]["feature_names"] = self.metadata_header

        leaves = {n:node for n, node in self.dendrogram["data"]["nodes"].items() if node["count"] == 1}

        if not self.compress:

            for leaf_id, leaf in leaves.items():
                try:
                    self.dendrogram["metadata"]["nodes"][leaf_id] = self.metadata[leaf["objects"][0]]
                except Exception:
                    # Leaves without a metadata row are skipped silently.
                    continue
        else:
            # FIX: the original lambdas declared a "values" parameter but
            # operated on the loop variable "col" leaked in via closure scope.
            compressed_value2fnc = {
                "median": lambda values: round(numpy.median(values), 3),
                "mean": lambda values: round(numpy.average(values), 3)
            }

            for leaf in leaves:
                objects = []
                for item in leaves[leaf]["objects"]:
                    try:
                        objects.append(self.metadata[item])
                    except Exception:
                        continue

                cols = zip(*objects)
                row = []
                cols = [list(c) for c in cols]

                for col in cols:

                    if self.metadata_compressed_value in compressed_value2fnc:
                        try:
                            col = [float(c) for c in col]
                            value = compressed_value2fnc[self.metadata_compressed_value](col)
                        except ValueError:
                            # Non-numeric column: fall back to the most frequent value.
                            freq2val = {col.count(v):v for v in set(col)}
                            value = freq2val[max(freq2val.keys())]

                    elif self.metadata_compressed_value == "frequency":
                        freq2val = {col.count(v):v for v in set(col)}
                        value = freq2val[max(freq2val.keys())]

                    else:
                        raise Exception("Unknown type of metadata_compressed_value: {}. Possible values are: median, mean, frequency.".format(self.metadata_compressed_value))

                    row.append(value)

                self.dendrogram["metadata"]["nodes"][leaf] = row
        return

    def __read_metadata__(self, metadata, header):
        """Parses in-memory metadata rows; first column is the object id."""
        metadata_header = []
        rows = metadata
        metadata = {}
        data_start = 0

        if header:
            metadata_header = rows[0][1:]
            data_start = 1

        for row in rows[data_start:]:
            metadata[str(row[0])] = [r for r in row[1:]]

        return metadata, metadata_header


    def __read_metadata_file__(self, metadata_file, delimiter, header):
        """Parses a delimited metadata file; first column is the object id."""
        # FIX: context manager -- the original leaked the file handle.
        with open(metadata_file, "r") as input_file:
            rows = [row for row in csv.reader(input_file, delimiter=delimiter)]
        metadata_header = []
        metadata = {}
        data_start = 0

        if header:
            metadata_header = rows[0][1:]
            data_start = 1

        for row in rows[data_start:]:
            metadata_id = str(row[0])
            metadata[metadata_id] = [r for r in row[1:]]

        return metadata, metadata_header
+
+
class Cluster():
    """Class for data clustering"""

    def __init__(self):
        # When True, normalized data are clustered but the original
        # (un-normalized) values are written to the heatmap.
        self.write_original = False

    def read_csv(self, filename, delimiter=",", header=False):
        """Reads data from the CSV file"""
        self.filename = filename
        # FIX: context manager -- the original leaked the file handle.
        with open(self.filename, "r") as input_file:
            rows = [row for row in csv.reader(input_file, delimiter=delimiter)]
        self.read_data(rows, header)

    def read_data(self, rows, header=False):
        """Reads data in a form of list of lists (tuples).
        First column of every data row is the row id; values are rounded to 3 decimals."""
        self.header = header
        data_start = 0

        if self.header:
            # First cell of the header row labels the id column; drop it.
            self.header = rows[0][1:]
            data_start = 1

        self.data_names = [str(row[0]) for row in rows[data_start:]]
        self.data = [[round(float(value), 3) for value in row[1:]] for row in rows[data_start:]]
        # Untouched copy for the optional write_original output.
        self.original_data = copy.deepcopy(self.data)
        return

    def normalize_data(self, feature_range=(0,1), write_original=False):
        """Normalizes data to a scale from 0 to 1. When write_original is set to True,
        the normalized data will be clustered, but original data will be written to the heatmap."""
        self.write_original = write_original
        min_max_scaler = preprocessing.MinMaxScaler(feature_range)
        self.data = min_max_scaler.fit_transform(self.data)
        self.data = [[round(v, 3) for v in row] for row in self.data]
        return

    def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Performs clustering according to the given parameters.
        @data_type - numeric/binary
        @row_distance/column_distance - see. DISTANCES variable
        @row_linkage/column_linkage - see. LINKAGES variable
        @axis - row/both
        """

        print("Clustering rows:", row_distance, row_linkage)
        self.data_type = data_type
        self.clustering_axis = axis
        row_linkage = str(row_linkage)

        if row_linkage in RAW_LINKAGES:
            # ward/centroid must be computed from the raw observation matrix.
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)

        else:
            # FIX: validate the data type / distance combination *before*
            # computing the distance vector -- the original called pdist
            # first, so an invalid metric crashed before the helpful message.
            # Also fixed the missing spaces in the joined error message.
            if not data_type in DISTANCES.keys():
                raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())]))
            elif not row_distance in DISTANCES[data_type]:
                raise Exception("".join(["When clustering ", data_type, " data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])]))

            self.distance_vector = fastcluster.pdist(self.data, row_distance)
            self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

        self.column_clustering = []
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original:
            self.data = self.original_data

        return

    def __cluster_columns__(self, column_distance, column_linkage):
        """Clusters the transposed data and reorders rows/header accordingly."""
        # list() keeps this correct on both Python 2 and 3 (zip is lazy in 3).
        columns = list(zip(*self.data))
        self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
        self.data_order = hcluster.leaves_list(self.column_clustering)
        self.data = self.__reorder_data__(self.data, self.data_order)
        self.original_data = self.__reorder_data__(self.original_data, self.data_order)
        if self.header:
            self.header = self.__reorder_data__([self.header], self.data_order)[0]
        return

    def __reorder_data__(self, data, order):
        """Reorders every row of data by the given column order (in place),
        reversed to match the orientation InCHlib expects."""
        # FIX: replaced Python-2-only xrange with an enumerate loop.
        for i, row in enumerate(data):
            reordered_data = [row[j] for j in order]
            reordered_data.reverse()
            data[i] = reordered_data

        return data
+
def _process_(arguments):
    """Drives the full pipeline from parsed CLI arguments:
    read -> (normalize) -> cluster -> heatmap -> (metadata) -> output."""
    cluster = Cluster()
    cluster.read_csv(arguments.data_file, arguments.data_delimiter, arguments.data_header)

    if arguments.normalize:
        cluster.normalize_data(feature_range=(0,1), write_original=arguments.write_original)

    cluster.cluster_data(
        data_type=arguments.datatype,
        row_distance=arguments.row_distance,
        row_linkage=arguments.row_linkage,
        axis=arguments.axis,
        column_distance=arguments.column_distance,
        column_linkage=arguments.column_linkage,
    )

    dendrogram = Dendrogram(cluster)
    dendrogram.create_cluster_heatmap(
        compress=arguments.compress,
        compressed_value=arguments.compressed_value,
        write_data=not arguments.dont_write_data,
    )

    if arguments.metadata:
        dendrogram.add_metadata_from_file(
            metadata_file=arguments.metadata,
            delimiter=arguments.metadata_delimiter,
            header=arguments.metadata_header,
            metadata_compressed_value=arguments.metadata_compressed_value,
        )

    # Output priority: JSON file, then HTML directory, else stdout.
    if arguments.output_file:
        dendrogram.export_cluster_heatmap_as_json(arguments.output_file)
    elif arguments.html_dir:
        dendrogram.export_cluster_heatmap_as_html(arguments.html_dir)
    else:
        print(json.dumps(dendrogram.dendrogram, indent=4))
+
+
if __name__ == '__main__':
    # Build the CLI. The option specs are kept in declaration order so the
    # generated --help output matches the original one-call-per-option form.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("data_file", type=str, help="csv(text) data file with delimited values")

    _options = [
        ("-o", "--output_file", dict(type=str, help="the name of output file")),
        ("-html", "--html_dir", dict(type=str, help="the directory to store HTML page with dependencies")),
        ("-rd", "--row_distance", dict(type=str, default="euclidean", help="set the distance to use for clustering rows")),
        ("-rl", "--row_linkage", dict(type=str, default="ward", help="set the linkage to use for clustering rows")),
        ("-cd", "--column_distance", dict(type=str, default="euclidean", help="set the distance to use for clustering columns (only when clustering by both axis -a parameter)")),
        ("-cl", "--column_linkage", dict(type=str, default="ward", help="set the linkage to use for clustering columns (only when clustering by both axis -a parameter)")),
        ("-a", "--axis", dict(type=str, default="row", help="define clustering axis (row/both)")),
        ("-dt", "--datatype", dict(type=str, default="numeric", help="specify the type of the data (numeric/binary)")),
        ("-dd", "--data_delimiter", dict(type=str, default=",", help="delimiter of values in data file")),
        ("-m", "--metadata", dict(type=str, default=None, help="csv(text) metadata file with delimited values")),
        ("-md", "--metadata_delimiter", dict(type=str, default=",", help="delimiter of values in metadata file")),
        ("-dh", "--data_header", dict(default=False, help="whether the first row of data file is a header", action="store_true")),
        ("-mh", "--metadata_header", dict(default=False, help="whether the first row of metadata file is a header", action="store_true")),
        ("-c", "--compress", dict(type=int, default=0, help="compress the data to contain maximum of specified count of rows")),
        ("-cv", "--compressed_value", dict(type=str, default="median", help="the resulted value from merged rows when the data are compressed (median/mean/frequency)")),
        ("-mcv", "--metadata_compressed_value", dict(type=str, default="median", help="the resulted value from merged rows of metadata when the data are compressed (median/mean/count)")),
        ("-dwd", "--dont_write_data", dict(default=False, help="don't write clustered data to the inchlib data format", action="store_true")),
        ("-n", "--normalize", dict(default=False, help="normalize data to [0, 1] range", action="store_true")),
        ("-wo", "--write_original", dict(default=False, help="cluster normalized data but write the original ones to the heatmap", action="store_true")),
    ]
    for short_flag, long_flag, kwargs in _options:
        parser.add_argument(short_flag, long_flag, **kwargs)

    args = parser.parse_args()
    _process_(args)
+    
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/inchlib_clust.xml	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,230 @@
+<tool id="inchlib_clust" name="INCHlib" version="1.0.0" hidden="false">
+    <requirements>
+        <requirement type="package" version="3.5.0">blas</requirement>
+        <requirement type="package" version="1.7">numpy</requirement>
+        <requirement type="package" version="0.12">scipy</requirement>
+        <requirement type="package" version="0.15">scikit-learn</requirement>
+        <requirement type="package" version="1.1.13">fastcluster</requirement>
+        <requirement type="python-module">numpy</requirement>
+        <requirement type="python-module">scipy</requirement>
+        <requirement type="python-module">scikit-learn</requirement>
+        <requirement type="python-module">fastcluster</requirement>
+    </requirements>
+    <description>Performs data clustering for input to InCHlib.js</description>
+    <command interpreter="python">
+        inchlib_clust.py $input -o $output
+                        --row_distance $row_distance
+                        --row_linkage $row_linkage
+                        --column_distance $column_distance
+                        --column_linkage $column_linkage
+                        --axis $axis
+                        --datatype $datatype
+                        --data_delimiter $data_delimiter
+                    #if $data_header.data_header_present == "yes"
+                        --data_header
+                    #end if
+                    #if $metadata_option.metadata_present == "yes"
+                        --metadata $metadata_option.metadata
+                        --metadata_delimiter $metadata_option.metadata_delimiter
+                        #if $metadata_option.metadata_header.metadata_header_present == "yes"
+                            --metadata_header
+                        #end if
+                        #if $metadata_option.metadata_compress_option.should_compress_metadata == "yes"
+                            --metadata_compressed_value $metadata_option.metadata_compressed_value
+                        #end if
+                        #end if
+                    #if $data_compress_option.should_compress_data == "yes"
+                        --data_compressed_value $data_compressed_value
+                    #end if
+                    #if $should_normalize == "yes"
+                        --normalize
+                    #end if
+    </command>
+    <inputs>
+        <param format="text" name="input" type="data" label="Input File" />
+
+        <param name="row_distance" type="select" label="Row Distance Method" help="Set the distance to use for clustering rows">
+            <option value="braycurtis">braycurtis</option>
+            <option value="canberra">canberra</option>
+            <option value="chebyshev">chebyshev</option>
+            <option value="cityblock">cityblock</option>
+            <option value="correlation">correlation</option>
+            <option value="cosine">cosine</option>
+            <option value="euclidean" selected="true">euclidean</option>
+            <option value="mahalanobis">mahalanobis</option>
+            <option value="minkowski">minkowski</option>
+            <option value="seuclidean">seuclidean</option>
+            <option value="sqeuclidean">sqeuclidean</option>
+
+            <option value="dice">dice</option>
+            <option value="hamming">hamming</option>
+            <option value="jaccard">jaccard</option>
+            <option value="kulsinski">kulsinski</option>
+            <option value="matching">matching</option>
+            <option value="rogerstanimoto">rogerstanimoto</option>
+            <option value="russellrao">russellrao</option>
+            <option value="sokalmichener">sokalmichener</option>
+            <option value="sokalsneath">sokalsneath</option>
+            <option value="yule">yule</option>
+        </param>
+
+        <param name="row_linkage" type="select" label="Row Linkage Method" help="Set the linkage to use for clustering rows">
+                <option value="single">single</option>
+                <option value="complete">complete</option>
+                <option value="average">average</option>
+                <option value="centroid">centroid</option>
+                <option value="ward">ward</option>
+                <option value="median">median</option>
+                <option value="weighted">weighted</option>
+
+                <option value="ward" selected="true">ward</option>
+                <option value="centroid">centroid</option>
+        </param>
+
+        <param name="column_distance" type="select" label="Column Distance Method" help="Set the distance to use for clustering columns">
+            <option value="braycurtis">braycurtis</option>
+            <option value="canberra">canberra</option>
+            <option value="chebyshev">chebyshev</option>
+            <option value="cityblock">cityblock</option>
+            <option value="correlation">correlation</option>
+            <option value="cosine">cosine</option>
+            <option value="euclidean" selected="true">euclidean</option>
+            <option value="mahalanobis">mahalanobis</option>
+            <option value="minkowski">minkowski</option>
+            <option value="seuclidean">seuclidean</option>
+            <option value="sqeuclidean">sqeuclidean</option>
+
+            <option value="dice">dice</option>
+            <option value="hamming">hamming</option>
+            <option value="jaccard">jaccard</option>
+            <option value="kulsinski">kulsinski</option>
+            <option value="matching">matching</option>
+            <option value="rogerstanimoto">rogerstanimoto</option>
+            <option value="russellrao">russellrao</option>
+            <option value="sokalmichener">sokalmichener</option>
+            <option value="sokalsneath">sokalsneath</option>
+            <option value="yule">yule</option>
+        </param>
+
+        <param name="column_linkage" type="select" label="Column Linkage" help="Set the linkage to use for clustering columns">
+                <option value="single">single</option>
+                <option value="complete">complete</option>
+                <option value="average">average</option>
+                <option value="centroid">centroid</option>
+                <option value="ward" selected="true">ward</option>
+                <option value="median">median</option>
+                <option value="weighted">weighted</option>
+        </param>
+
+        <param name="axis" type="select" label="Clustering Axis">
+            <option value="row" selected="true">row</option>
+            <option value="both">both</option>
+        </param>
+
+        <param name="datatype" type="select" label="Type Of Data">
+            <option value="numeric" selected="true">numeric</option>
+            <option value="binary">binary</option>
+        </param>
+
+        <param name="data_delimiter" type="select" label="Data Delimiter">
+            <option value="\t">tab</option>
+            <option value=",">comma</option>
+            <option value=";">semicolon</option>
+        </param>
+
+        <conditional name="data_header">
+            <param name="data_header_present" type="select" label="First row of file is header">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+        </conditional>
+
+        <conditional name="data_compress_option">
+            <param name="should_compress_data" type="select" label="Compress Rows">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param name="data_compressed_value" type="select">
+                    <option value="median" selected="true">median</option>
+                    <option value="mean">mean</option>
+                </param>
+            </when>
+            <when value="no"/>
+        </conditional>
+
+        <param name="should_normalize" type="select" label="Normalise">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+        </param>
+
+        <conditional name="metadata_option">
+            <param name="metadata_present" type="select" label="Metadata Input">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param name="metadata" type="data" format="text" label="Metadata File"/>
+                <conditional name="metadata_header">
+                    <param name="metadata_header_present" type="select" label="Metadata Header Present?">
+                        <option value="yes">Yes</option>
+                        <option value="no" selected="true">No</option>
+                    </param>
+                </conditional>
+                <param name="metadata_delimiter" type="select" label="Metadata Delimiter">
+                    <option value="\t">tab</option>
+                    <option value=",">comma</option>
+                    <option value=";">semicolon</option>
+                </param>
+                <conditional name="metadata_compress_option">
+                    <param name="should_compress_metadata" type="select" label="Compress Metadata Rows?">
+                        <option value="yes">Yes</option>
+                        <option value="no" selected="true">No</option>
+                    </param>
+                    <when value="yes">
+                        <param name="metadata_compressed_value" type="select">
+                            <option value="median" selected="true">median</option>
+                            <option value="mean">mean</option>
+                            <option value="frequency">frequency</option>
+                        </param>
+                    </when>
+                    <when value="no"/>
+                </conditional>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </inputs>
+
+    <outputs>
+        <data format="json" name="output" label="${tool.name} on ${on_string}: output" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="input" value="inchlib_input1.csv"/>
+            <param name="row_distance" value="euclidean"/>
+            <param name="row_linkage" value="ward"/>
+            <param name="column_distance" value="euclidean"/>
+            <param name="column_linkage" value="ward"/>
+            <param name="axis" value="row" />
+            <param name="datatype" value="numeric" />
+            <param name="data_delimiter" value="," />
+            <param name="data_header_present" value="yes" />
+            <param name="should_compress_data" value="no" />
+            <param name="should_normalize" value="no" />
+            <param name="metadata_present" value="yes" />
+            <param name="metadata" value="inchlib_input_metadata.csv" />
+            <param name="metadata_delimiter" value="," />
+            <param name="metadata_header_present" value="yes" />
+            <output name="output" file="inchlib_output.json" ftype="json" />
+        </test>
+    </tests>
+
+    <help>
+        **What it does**
+
+            Performs data clustering and prepares input data for InCHlib.
+
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/test-data/inchlib_input.csv	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,6 @@
+id,Provean,Sift,MA
+1,0.0,1.0,1.0
+2,0.0,1.0,1.0
+3,0.0,1.0,1.0
+4,0.0,0.0,0.0
+5,0.0,1.0,1.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/test-data/inchlib_input1.csv	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,6 @@
+id,feature 1,feature 2,feature 3,feature 4
+1,5.1,3.5,1.4,0.2
+2,4.9,3,1.4,0.2
+3,4.7,3.2,1.3,0.2
+4,4.6,3.1,1.5,0.2
+5,5,3.6,1.4,0.2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/test-data/inchlib_input_metadata.csv	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,6 @@
+id,class
+1,class 1
+2,class 1
+3,class 2
+4,class 2
+5,class 3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/test-data/inchlib_output.json	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,131 @@
+{
+    "data": {
+        "nodes": {
+            "0": {
+                "count": 1, 
+                "distance": 0, 
+                "objects": [
+                    "1"
+                ], 
+                "features": [
+                    5.1, 
+                    3.5, 
+                    1.4, 
+                    0.2
+                ], 
+                "parent": 5
+            }, 
+            "1": {
+                "count": 1, 
+                "distance": 0, 
+                "objects": [
+                    "2"
+                ], 
+                "features": [
+                    4.9, 
+                    3.0, 
+                    1.4, 
+                    0.2
+                ], 
+                "parent": 7
+            }, 
+            "2": {
+                "count": 1, 
+                "distance": 0, 
+                "objects": [
+                    "3"
+                ], 
+                "features": [
+                    4.7, 
+                    3.2, 
+                    1.3, 
+                    0.2
+                ], 
+                "parent": 6
+            }, 
+            "3": {
+                "count": 1, 
+                "distance": 0, 
+                "objects": [
+                    "4"
+                ], 
+                "features": [
+                    4.6, 
+                    3.1, 
+                    1.5, 
+                    0.2
+                ], 
+                "parent": 6
+            }, 
+            "4": {
+                "count": 1, 
+                "distance": 0, 
+                "objects": [
+                    "5"
+                ], 
+                "features": [
+                    5.0, 
+                    3.6, 
+                    1.4, 
+                    0.2
+                ], 
+                "parent": 5
+            }, 
+            "5": {
+                "count": 2, 
+                "distance": 0.141, 
+                "left_child": 0, 
+                "parent": 8, 
+                "right_child": 4
+            }, 
+            "6": {
+                "count": 2, 
+                "distance": 0.245, 
+                "left_child": 2, 
+                "parent": 7, 
+                "right_child": 3
+            }, 
+            "7": {
+                "count": 3, 
+                "distance": 0.337, 
+                "left_child": 1, 
+                "parent": 8, 
+                "right_child": 6
+            }, 
+            "8": {
+                "count": 5, 
+                "distance": 0.852, 
+                "left_child": 5, 
+                "right_child": 7
+            }
+        }, 
+        "feature_names": [
+            "feature 1", 
+            "feature 2", 
+            "feature 3", 
+            "feature 4"
+        ]
+    }, 
+    "metadata": {
+        "nodes": {
+            "0": [
+                "class 1"
+            ], 
+            "1": [
+                "class 1"
+            ], 
+            "2": [
+                "class 2"
+            ], 
+            "3": [
+                "class 2"
+            ], 
+            "4": [
+                "class 3"
+            ]
+        }, 
+        "feature_names": [
+            "class"
+        ]
+    }
+}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/inchlib_clust/tool_dependencies.xml	Tue Oct 07 19:35:50 2014 -0400
@@ -0,0 +1,9 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="scikit-learn" version="0.15">
+        <repository changeset_revision="ef12a3a11d5b" name="package_numpy_1_7" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="336aead68607" name="package_scipy_0_12" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="5d85851b211b" name="package_scikit_learn_0_15" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="6ef4ff4524ca" name="package_fastcluster_1_1_13" owner="saket-choudhary" toolshed="http://toolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>