# HG changeset patch # User saket-choudhary # Date 1412724950 14400 # Node ID 60f93f839759709d2e7eaacbf68c617f3b4f395b Uploaded diff -r 000000000000 -r 60f93f839759 inchlib_clust/inchlib_clust.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/inchlib_clust.py Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,533 @@ +#coding: utf-8 +from __future__ import print_function + +import csv, json, copy, re, argparse, os, urllib2 + +import numpy, scipy, fastcluster, sklearn +import scipy.cluster.hierarchy as hcluster +from sklearn import preprocessing +from scipy import spatial + +LINKAGES = ["single", "complete", "average", "centroid", "ward", "median", "weighted"] +RAW_LINKAGES = ["ward", "centroid"] +DISTANCES = {"numeric": ["braycurtis", "canberra", "chebyshev", "cityblock", "correlation", "cosine", "euclidean", "mahalanobis", "minkowski", "seuclidean", "sqeuclidean"], + "binary": ["dice","hamming","jaccard","kulsinski","matching","rogerstanimoto","russellrao","sokalmichener","sokalsneath","yule"]} + +class Dendrogram(): + """Class which handles the generation of cluster heatmap format of clustered data. 
+ As an input it takes a Cluster instance with clustered data.""" + + def __init__(self, clustering): + self.cluster_object = clustering + self.data_type = clustering.data_type + self.axis = clustering.clustering_axis + self.clustering = clustering.clustering + self.tree = hcluster.to_tree(self.clustering) + self.data = clustering.data + self.data_names = clustering.data_names + self.header = clustering.header + self.dendrogram = False + + def __get_cluster_heatmap__(self, write_data): + root, nodes = hcluster.to_tree(self.clustering, rd=True) + node_id2node = {} + dendrogram = {"nodes":{}} + + for node in nodes: + node_id = node.id + if node.count == 1: + node_id2node[node_id] = {"count":1, "distance":0} + + else: + node_left_child = node.get_left().id + node_right_child = node.get_right().id + node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child} + + for n in node_id2node: + node = node_id2node[n] + if node["count"] != 1: + node_id2node[node["left_child"]]["parent"] = n + node_id2node[node["right_child"]]["parent"] = n + + for n in node_id2node: + node = node_id2node[n] + + if node["count"] == 1: + data = self.data[n] + node["objects"] = [self.data_names[n]] + + if node_id2node[node["parent"]]["left_child"] == n: + node_id2node[node["parent"]]["left_child"] = n + else: + node_id2node[node["parent"]]["right_child"] = n + + if not write_data: + data = [] + + node["features"] = data + dendrogram["nodes"][n] = node + + for n in node_id2node: + if node_id2node[n]["count"] != 1: + dendrogram["nodes"][n] = node_id2node[n] + + return dendrogram + + def __get_column_dendrogram__(self): + root, nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True) + node_id2node = {} + dendrogram = {"nodes":{}} + + for node in nodes: + node_id = node.id + if node.count == 1: + node_id2node[node_id] = {"count":1, "distance":0} + + else: + node_left_child = node.get_left().id + 
node_right_child = node.get_right().id + node_id2node[node_id] = {"count":node.count, "distance":round(node.dist, 3), "left_child": node_left_child, "right_child": node_right_child} + + for n in node_id2node: + node = node_id2node[n] + if node["count"] != 1: + node_id2node[node["left_child"]]["parent"] = n + node_id2node[node["right_child"]]["parent"] = n + + for n in node_id2node: + if not n in dendrogram["nodes"]: + dendrogram["nodes"][n] = node_id2node[n] + + return dendrogram + + def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True): + """Creates cluster heatmap representation in inchlib format. By setting compress parameter to True you can + cut the dendrogram in a distance to decrease the row size of the heatmap to specified count. + When compressing the type of the resulted value of merged rows is given by the compressed_value parameter (median, mean). + When the metadata are nominal (text values) the most frequent is the result after compression. 
+ By setting write_data to False the data features won't be present in the resulting format.""" + self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)} + + self.compress = compress + self.compressed_value = compressed_value + self.compress_cluster_treshold = 0 + if self.compress and self.compress >= 0: + self.compress_cluster_treshold = self.__get_distance_treshold__(compress) + print("Distance treshold for compression:", self.compress_cluster_treshold) + if self.compress_cluster_treshold >= 0: + self.__compress_data__() + else: + self.compress = False + + if self.header and write_data: + self.dendrogram["data"]["feature_names"] = [h for h in self.header] + elif self.header and not write_data: + self.dendrogram["data"]["feature_names"] = [] + + if self.axis == "both" and len(self.cluster_object.column_clustering): + column_dendrogram = hcluster.to_tree(self.cluster_object.column_clustering) + self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__() + return + + def __compress_data__(self): + nodes = {} + to_remove = set() + + compressed_value2fnc = { + "median": lambda values: [round(numpy.median(value), 3) for value in values], + "mean": lambda values: [round(numpy.average(value), 3) for value in values], + } + + for n in self.dendrogram["data"]["nodes"]: + node = self.dendrogram["data"]["nodes"][n] + + if node["count"] == 1: + objects = node["objects"] + data = node["features"] + node_id = n + + while self.dendrogram["data"]["nodes"][node["parent"]]["distance"] <= self.compress_cluster_treshold: + to_remove.add(node_id) + node_id = node["parent"] + node = self.dendrogram["data"]["nodes"][node_id] + + if node["count"] != 1: + + if not "objects" in self.dendrogram["data"]["nodes"][node_id]: + self.dendrogram["data"]["nodes"][node_id]["objects"] = [] + self.dendrogram["data"]["nodes"][node_id]["features"] = [] + + self.dendrogram["data"]["nodes"][node_id]["objects"].extend(objects) + + if data: + 
self.dendrogram["data"]["nodes"][node_id]["features"].append(data) + + for node in to_remove: + self.dendrogram["data"]["nodes"].pop(node) + + for k in self.dendrogram["data"]["nodes"]: + node = self.dendrogram["data"]["nodes"][k] + if "objects" in node and node["count"] != 1: + self.dendrogram["data"]["nodes"][k]["distance"] = 0 + self.dendrogram["data"]["nodes"][k]["count"] = 1 + self.dendrogram["data"]["nodes"][k].pop("left_child") + self.dendrogram["data"]["nodes"][k].pop("right_child") + rows = zip(*self.dendrogram["data"]["nodes"][k]["features"]) + self.dendrogram["data"]["nodes"][k]["features"] = compressed_value2fnc[self.compressed_value](rows) + + self.__adjust_node_counts__() + + return + + def __adjust_node_counts__(self): + leaves = [] + + for n in self.dendrogram["data"]["nodes"]: + if self.dendrogram["data"]["nodes"][n]["count"] > 1: + self.dendrogram["data"]["nodes"][n]["count"] = 0 + else: + leaves.append(n) + + for n in leaves: + node = self.dendrogram["data"]["nodes"][n] + parent_id = node["parent"] + + while parent_id: + node = self.dendrogram["data"]["nodes"][parent_id] + self.dendrogram["data"]["nodes"][parent_id]["count"] += 1 + parent_id = False + if "parent" in node: + parent_id = node["parent"] + return + + def __get_distance_treshold__(self, cluster_count): + print("Calculating distance treshold for cluster compression...") + if cluster_count >= self.tree.count: + return -1 + + i = 0 + count = cluster_count + 1 + test_step = self.tree.dist/2 + + while test_step >= 0.1: + count = len(set([c for c in hcluster.fcluster(self.clustering, i, "distance")])) + if count < cluster_count: + if i == 0: + return 0 + i = i - test_step + test_step = test_step/2 + elif count == cluster_count: + return i + else: + i += test_step + + return i+test_step*2 + + def export_cluster_heatmap_as_json(self, filename=None): + """Returns cluster heatmap in a JSON format or exports it to the file specified by the filename parameter.""" + dendrogram_json = 
json.dumps(self.dendrogram, indent=4) + if filename: + output = open(filename, "w") + output.write(dendrogram_json) + return dendrogram_json + + def export_cluster_heatmap_as_html(self, htmldir="."): + """Export simple HTML page with embedded cluster heatmap and dependencies to given directory.""" + if not os.path.exists(htmldir): + os.makedirs(htmldir) + dendrogram_json = json.dumps(self.dendrogram, indent=4) + template = """ + + + + + + + + +
+ + """.format(dendrogram_json) + + lib2url = {"inchlib-1.0.1.min.js": "http://openscreen.cz/software/inchlib/static/js/inchlib-1.0.1.min.js", + "jquery-2.0.3.min.js": "http://openscreen.cz/software/inchlib/static/js/jquery-2.0.3.min.js", + "kinetic-v5.0.0.min.js": "http://openscreen.cz/software/inchlib/static/js/kinetic-v5.0.0.min.js"} + + for lib, url in lib2url.items(): + try: + source = urllib2.urlopen(url) + source_html = source.read() + with open(os.path.join(htmldir, lib), "w") as output: + output.write(source_html) + except urllib2.URLError, e: + raise Exception("\nCan't download file {}.\nPlease check your internet connection and try again.\nIf the error persists there can be something wrong with the InCHlib server.\n".format(url)) + + with open(os.path.join(htmldir, "inchlib.html"), "w") as output: + output.write(template) + return + + def add_metadata_from_file(self, metadata_file, delimiter, header=True, metadata_compressed_value="median"): + """Adds metadata from csv file. + Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)""" + self.metadata_compressed_value = metadata_compressed_value + self.metadata, self.metadata_header = self.__read_metadata_file__(metadata_file, delimiter, header) + self.__connect_metadata_to_data__() + return + + def add_metadata(self, metadata, header=True, metadata_compressed_value="median"): + """Adds metadata in a form of list of lists (tuples). 
+ Metadata_compressed_value specifies the resulted value when the data are compressed (median/mean/frequency)""" + self.metadata_compressed_value = metadata_compressed_value + self.metadata, self.metadata_header = self.__read_metadata__(metadata, header) + self.__connect_metadata_to_data__() + return + + def __connect_metadata_to_data__(self): + if len(set(self.metadata.keys()) & set(self.data_names)) == 0: + raise Exception("Metadata objects must correspond with original data objects.") + + if not self.dendrogram: + raise Exception("You must create dendrogram before adding metadata.") + + self.dendrogram["metadata"] = {"nodes":{}} + + if self.metadata_header: + self.dendrogram["metadata"]["feature_names"] = self.metadata_header + + leaves = {n:node for n, node in self.dendrogram["data"]["nodes"].items() if node["count"] == 1} + + if not self.compress: + + for leaf_id, leaf in leaves.items(): + try: + self.dendrogram["metadata"]["nodes"][leaf_id] = self.metadata[leaf["objects"][0]] + except Exception, e: + continue + else: + compressed_value2fnc = { + "median": lambda values: round(numpy.median(values), 3), + "mean": lambda values: round(numpy.average(values), 3) + } + + for leaf in leaves: + objects = [] + for item in leaves[leaf]["objects"]: + try: + objects.append(self.metadata[item]) + except Exception, e: + continue + + cols = zip(*objects) + row = [] + cols = [list(c) for c in cols] + + for col in cols: + + if self.metadata_compressed_value in compressed_value2fnc: + try: + col = [float(c) for c in col] + value = compressed_value2fnc[self.metadata_compressed_value](col) + except ValueError: + freq2val = {col.count(v):v for v in set(col)} + value = freq2val[max(freq2val.keys())] + + elif self.metadata_compressed_value == "frequency": + freq2val = {col.count(v):v for v in set(col)} + value = freq2val[max(freq2val.keys())] + + else: + raise Exception("Unknown type of metadata_compressed_value: {}. 
Possible values are: median, mean, frequency.".format(self.metadata_compressed_value)) + + row.append(value) + + self.dendrogram["metadata"]["nodes"][leaf] = row + return + + def __read_metadata__(self, metadata, header): + metadata_header = [] + rows = metadata + metadata = {} + data_start = 0 + + if header: + metadata_header = rows[0][1:] + data_start = 1 + + for row in rows[data_start:]: + metadata[str(row[0])] = [r for r in row[1:]] + + return metadata, metadata_header + + + def __read_metadata_file__(self, metadata_file, delimiter, header): + csv_reader = csv.reader(open(metadata_file, "r"), delimiter=delimiter) + metadata_header = [] + rows = [row for row in csv_reader] + metadata = {} + data_start = 0 + + if header: + metadata_header = rows[0][1:] + data_start = 1 + + for row in rows[data_start:]: + metadata_id = str(row[0]) + metadata[metadata_id] = [r for r in row[1:]] + + return metadata, metadata_header + + +class Cluster(): + """Class for data clustering""" + + def __init__(self): + self.write_original = False + + def read_csv(self, filename, delimiter=",", header=False): + """Reads data from the CSV file""" + self.filename = filename + csv_reader = csv.reader(open(self.filename, "r"), delimiter=delimiter) + rows = [row for row in csv_reader] + self.read_data(rows, header) + + def read_data(self, rows, header=False): + """Reads data in a form of list of lists (tuples)""" + self.header = header + data_start = 0 + + if self.header: + self.header = rows[0][1:] + data_start = 1 + + self.data_names = [str(row[0]) for row in rows[data_start:]] + self.data = [[round(float(value), 3) for value in row[1:]] for row in rows[data_start:]] + self.original_data = copy.deepcopy(self.data) + return + + def normalize_data(self, feature_range=(0,1), write_original=False): + """Normalizes data to a scale from 0 to 1. 
When write_original is set to True, + the normalized data will be clustered, but original data will be written to the heatmap.""" + self.write_original = write_original + min_max_scaler = preprocessing.MinMaxScaler(feature_range) + self.data = min_max_scaler.fit_transform(self.data) + self.data = [[round(v, 3) for v in row] for row in self.data] + return + + def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"): + """Performs clustering according to the given parameters. + @data_type - numeric/binary + @row_distance/column_distance - see the DISTANCES variable + @row_linkage/column_linkage - see the LINKAGES variable + @axis - row/both + """ + + print("Clustering rows:", row_distance, row_linkage) + self.data_type = data_type + self.clustering_axis = axis + row_linkage = str(row_linkage) + + if row_linkage in RAW_LINKAGES: + self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance) + + else: + self.distance_vector = fastcluster.pdist(self.data, row_distance) + + if data_type in DISTANCES and not row_distance in DISTANCES[data_type]: + raise Exception("".join(["When clustering " , data_type, " data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])])) + elif not data_type in DISTANCES.keys(): + raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())])) + + self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage)) + + self.column_clustering = [] + if axis == "both" and len(self.data[0]) > 2: + print("Clustering columns:", column_distance, column_linkage) + self.__cluster_columns__(column_distance, column_linkage) + + if self.write_original: + self.data = self.original_data + + return + + def __cluster_columns__(self, column_distance, column_linkage): + columns = zip(*self.data) + self.column_clustering = fastcluster.linkage(columns, 
method=column_linkage, metric=column_distance) + self.data_order = hcluster.leaves_list(self.column_clustering) + self.data = self.__reorder_data__(self.data, self.data_order) + self.original_data = self.__reorder_data__(self.original_data, self.data_order) + if self.header: + self.header = self.__reorder_data__([self.header], self.data_order)[0] + return + + def __reorder_data__(self, data, order): + for i in xrange(len(data)): + reordered_data = [] + for j in order: + reordered_data.append(data[i][j]) + reordered_data.reverse() + data[i] = reordered_data + + return data + +def _process_(arguments): + c = Cluster() + c.read_csv(arguments.data_file, arguments.data_delimiter, arguments.data_header) + + if arguments.normalize: + c.normalize_data(feature_range=(0,1), write_original=arguments.write_original) + + c.cluster_data(data_type=arguments.datatype, row_distance=arguments.row_distance, row_linkage=arguments.row_linkage, axis=arguments.axis, column_distance=arguments.column_distance, column_linkage=arguments.column_linkage) + + d = Dendrogram(c) + d.create_cluster_heatmap(compress=arguments.compress, compressed_value=arguments.compressed_value, write_data=not arguments.dont_write_data) + + if arguments.metadata: + d.add_metadata_from_file(metadata_file=arguments.metadata, delimiter=arguments.metadata_delimiter, header=arguments.metadata_header, metadata_compressed_value=arguments.metadata_compressed_value) + + if arguments.output_file or arguments.html_dir: + if arguments.output_file: + d.export_cluster_heatmap_as_json(arguments.output_file) + else: + d.export_cluster_heatmap_as_html(arguments.html_dir) + else: + print(json.dumps(d.dendrogram, indent=4)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("data_file", type=str, help="csv(text) data file with delimited values") + parser.add_argument("-o", "--output_file", type=str, help="the name of output file") + 
parser.add_argument("-html", "--html_dir", type=str, help="the directory to store HTML page with dependencies") + parser.add_argument("-rd", "--row_distance", type=str, default="euclidean", help="set the distance to use for clustering rows") + parser.add_argument("-rl", "--row_linkage", type=str, default="ward", help="set the linkage to use for clustering rows") + parser.add_argument("-cd", "--column_distance", type=str, default="euclidean", help="set the distance to use for clustering columns (only when clustering by both axis -a parameter)") + parser.add_argument("-cl", "--column_linkage", type=str, default="ward", help="set the linkage to use for clustering columns (only when clustering by both axis -a parameter)") + parser.add_argument("-a", "--axis", type=str, default="row", help="define clustering axis (row/both)") + parser.add_argument("-dt", "--datatype", type=str, default="numeric", help="specify the type of the data (numeric/binary)") + parser.add_argument("-dd", "--data_delimiter", type=str, default=",", help="delimiter of values in data file") + parser.add_argument("-m", "--metadata", type=str, default=None, help="csv(text) metadata file with delimited values") + parser.add_argument("-md", "--metadata_delimiter", type=str, default=",", help="delimiter of values in metadata file") + parser.add_argument("-dh", "--data_header", default=False, help="whether the first row of data file is a header", action="store_true") + parser.add_argument("-mh", "--metadata_header", default=False, help="whether the first row of metadata file is a header", action="store_true") + parser.add_argument("-c", "--compress", type=int, default=0, help="compress the data to contain maximum of specified count of rows") + parser.add_argument("-cv", "--compressed_value", type=str, default="median", help="the resulted value from merged rows when the data are compressed (median/mean)") + parser.add_argument("-mcv", "--metadata_compressed_value", type=str, default="median", 
help="the resulted value from merged rows of metadata when the data are compressed (median/mean/frequency)") + parser.add_argument("-dwd", "--dont_write_data", default=False, help="don't write clustered data to the inchlib data format", action="store_true") + parser.add_argument("-n", "--normalize", default=False, help="normalize data to [0, 1] range", action="store_true") + parser.add_argument("-wo", "--write_original", default=False, help="cluster normalized data but write the original ones to the heatmap", action="store_true") + + args = parser.parse_args() + _process_(args) + diff -r 000000000000 -r 60f93f839759 inchlib_clust/inchlib_clust.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/inchlib_clust.xml Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,230 @@ + diff -r 000000000000 -r 60f93f839759 inchlib_clust/test-data/inchlib_input.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/test-data/inchlib_input.csv Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,6 @@ +id,Provean,Sift,MA +1,0.0,1.0,1.0 +2,0.0,1.0,1.0 +3,0.0,1.0,1.0 +4,0.0,0.0,0.0 +5,0.0,1.0,1.0 diff -r 000000000000 -r 60f93f839759 inchlib_clust/test-data/inchlib_input1.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/test-data/inchlib_input1.csv Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,6 @@ +id,feature 1,feature 2,feature 3,feature 4 +1,5.1,3.5,1.4,0.2 +2,4.9,3,1.4,0.2 +3,4.7,3.2,1.3,0.2 +4,4.6,3.1,1.5,0.2 +5,5,3.6,1.4,0.2 diff -r 000000000000 -r 60f93f839759 inchlib_clust/test-data/inchlib_input_metadata.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/test-data/inchlib_input_metadata.csv Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,6 @@ +id,class +1,class 1 +2,class 1 +3,class 2 +4,class 2 +5,class 3 diff -r 000000000000 -r 60f93f839759 inchlib_clust/test-data/inchlib_output.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/test-data/inchlib_output.json Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,131 @@ +{ + "data": { + 
"nodes": { + "0": { + "count": 1, + "distance": 0, + "objects": [ + "1" + ], + "features": [ + 5.1, + 3.5, + 1.4, + 0.2 + ], + "parent": 5 + }, + "1": { + "count": 1, + "distance": 0, + "objects": [ + "2" + ], + "features": [ + 4.9, + 3.0, + 1.4, + 0.2 + ], + "parent": 7 + }, + "2": { + "count": 1, + "distance": 0, + "objects": [ + "3" + ], + "features": [ + 4.7, + 3.2, + 1.3, + 0.2 + ], + "parent": 6 + }, + "3": { + "count": 1, + "distance": 0, + "objects": [ + "4" + ], + "features": [ + 4.6, + 3.1, + 1.5, + 0.2 + ], + "parent": 6 + }, + "4": { + "count": 1, + "distance": 0, + "objects": [ + "5" + ], + "features": [ + 5.0, + 3.6, + 1.4, + 0.2 + ], + "parent": 5 + }, + "5": { + "count": 2, + "distance": 0.141, + "left_child": 0, + "parent": 8, + "right_child": 4 + }, + "6": { + "count": 2, + "distance": 0.245, + "left_child": 2, + "parent": 7, + "right_child": 3 + }, + "7": { + "count": 3, + "distance": 0.337, + "left_child": 1, + "parent": 8, + "right_child": 6 + }, + "8": { + "count": 5, + "distance": 0.852, + "left_child": 5, + "right_child": 7 + } + }, + "feature_names": [ + "feature 1", + "feature 2", + "feature 3", + "feature 4" + ] + }, + "metadata": { + "nodes": { + "0": [ + "class 1" + ], + "1": [ + "class 1" + ], + "2": [ + "class 2" + ], + "3": [ + "class 2" + ], + "4": [ + "class 3" + ] + }, + "feature_names": [ + "class" + ] + } +} \ No newline at end of file diff -r 000000000000 -r 60f93f839759 inchlib_clust/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/inchlib_clust/tool_dependencies.xml Tue Oct 07 19:35:50 2014 -0400 @@ -0,0 +1,9 @@ + + + + + + + + +