annotate inchlib_clust/inchlib_clust.py @ 0:60f93f839759 draft default tip

Uploaded
author saket-choudhary
date Tue, 07 Oct 2014 19:35:50 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
1 #coding: utf-8
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
2 from __future__ import print_function
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
3
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
4 import csv, json, copy, re, argparse, os, urllib2
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
5
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
6 import numpy, scipy, fastcluster, sklearn
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
7 import scipy.cluster.hierarchy as hcluster
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
8 from sklearn import preprocessing
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
9 from scipy import spatial
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
10
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
# Names of the linkage methods this module accepts.
LINKAGES = [
    "single",
    "complete",
    "average",
    "centroid",
    "ward",
    "median",
    "weighted",
]

# Subset of LINKAGES kept in a separate list.
# NOTE(review): presumably the methods that need raw observations instead of a
# precomputed distance matrix -- confirm against the clustering code.
RAW_LINKAGES = ["ward", "centroid"]

# Distance metric names, grouped by the kind of data they are offered for.
DISTANCES = {
    "numeric": [
        "braycurtis",
        "canberra",
        "chebyshev",
        "cityblock",
        "correlation",
        "cosine",
        "euclidean",
        "mahalanobis",
        "minkowski",
        "seuclidean",
        "sqeuclidean",
    ],
    "binary": [
        "dice",
        "hamming",
        "jaccard",
        "kulsinski",
        "matching",
        "rogerstanimoto",
        "russellrao",
        "sokalmichener",
        "sokalsneath",
        "yule",
    ],
}
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
15
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
16 class Dendrogram():
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
17 """Class which handles the generation of cluster heatmap format of clustered data.
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
18 As an input it takes a Cluster instance with clustered data."""
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
19
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __init__(self, clustering):
    """Bind this dendrogram to an already clustered object.

    ``clustering`` must expose ``data_type``, ``clustering_axis``,
    ``clustering`` (the linkage handed to SciPy's ``to_tree``), ``data``,
    ``data_names`` and ``header``; each of them is copied onto the instance.
    """
    # The source object is kept because the column clustering is read from it later.
    self.cluster_object = clustering
    self.data_type, self.axis = clustering.data_type, clustering.clustering_axis

    linkage = clustering.clustering
    self.clustering = linkage
    self.tree = hcluster.to_tree(linkage)

    self.data, self.data_names = clustering.data, clustering.data_names
    self.header = clustering.header

    # Falsy placeholder; create_cluster_heatmap() replaces it with the real structure.
    self.dendrogram = False
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
30
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __get_cluster_heatmap__(self, write_data):
    """Convert the row clustering into the ``{"nodes": {...}}`` heatmap structure.

    Every node of the SciPy cluster tree built from ``self.clustering`` becomes
    one dict keyed by the node id:

    * leaves carry ``count`` (1), ``distance`` (0), ``parent``, ``objects``
      (a one-item list taken from ``self.data_names``) and ``features``
      (the matching row of ``self.data``, or ``[]`` when ``write_data`` is
      false);
    * inner nodes carry ``count``, ``distance`` (rounded to 3 decimals),
      ``left_child``, ``right_child`` and, except for the root, ``parent``.

    Leaves are inserted into the result before inner nodes.
    """
    # Only the flat node list is needed; the root object itself is not used.
    tree_nodes = hcluster.to_tree(self.clustering, rd=True)[1]

    node_id2node = {}
    for tree_node in tree_nodes:
        if tree_node.count == 1:
            node_id2node[tree_node.id] = {"count": 1, "distance": 0}
        else:
            node_id2node[tree_node.id] = {
                "count": tree_node.count,
                "distance": round(tree_node.dist, 3),
                "left_child": tree_node.get_left().id,
                "right_child": tree_node.get_right().id,
            }

    # Children only learn about their parent from the parent's child links.
    for node_id, node in node_id2node.items():
        if node["count"] != 1:
            node_id2node[node["left_child"]]["parent"] = node_id
            node_id2node[node["right_child"]]["parent"] = node_id

    dendrogram = {"nodes": {}}

    # Leaf ids index directly into self.data / self.data_names.
    for node_id, node in node_id2node.items():
        if node["count"] == 1:
            node["objects"] = [self.data_names[node_id]]
            node["features"] = self.data[node_id] if write_data else []
            dendrogram["nodes"][node_id] = node

    for node_id, node in node_id2node.items():
        if node["count"] != 1:
            dendrogram["nodes"][node_id] = node

    return dendrogram
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
75
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __get_column_dendrogram__(self):
    """Convert the column clustering into a ``{"nodes": {...}}`` structure.

    The tree is built from ``self.cluster_object.column_clustering``. Leaves
    get ``count`` 1 and ``distance`` 0; inner nodes get their ``count``, their
    ``distance`` rounded to 3 decimals and the ids of both children. Every
    node except the root also receives a ``parent`` id.
    """
    _, tree_nodes = hcluster.to_tree(self.cluster_object.column_clustering, rd=True)

    described = {}
    for tree_node in tree_nodes:
        if tree_node.count == 1:
            entry = {"count": 1, "distance": 0}
        else:
            left_id = tree_node.get_left().id
            right_id = tree_node.get_right().id
            entry = {"count": tree_node.count, "distance": round(tree_node.dist, 3), "left_child": left_id, "right_child": right_id}
        described[tree_node.id] = entry

    # Parent links are derived from the child links of the inner nodes.
    for node_id, entry in described.items():
        if entry["count"] == 1:
            continue
        for side in ("left_child", "right_child"):
            described[entry[side]]["parent"] = node_id

    dendrogram = {"nodes": {}}
    dendrogram["nodes"].update(described)
    return dendrogram
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
102
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def create_cluster_heatmap(self, compress=False, compressed_value="median", write_data=True):
    """Creates cluster heatmap representation in inchlib format.

    The result is stored in ``self.dendrogram``; nothing is returned.

    compress -- falsy for no compression, otherwise the requested number of
        rows; the dendrogram is cut at a distance computed by
        ``__get_distance_treshold__``. Negative values disable compression.
    compressed_value -- how the features of merged rows are combined
        ("median" or "mean").
    write_data -- when False the data features (and feature names) are left
        out of the resulting format.

    After the call ``self.compress`` is falsy whenever no compression took
    place, so later steps can rely on it.
    """
    self.dendrogram = {"data": self.__get_cluster_heatmap__(write_data)}

    self.compress = compress
    self.compressed_value = compressed_value
    self.compress_cluster_treshold = 0

    if self.compress and self.compress >= 0:
        self.compress_cluster_treshold = self.__get_distance_treshold__(compress)
        print("Distance treshold for compression:", self.compress_cluster_treshold)
        if self.compress_cluster_treshold >= 0:
            self.__compress_data__()
        else:
            # A negative threshold means the request cannot shrink the tree.
            self.compress = False
    else:
        # Negative or zero requests never compress; do not leave a truthy
        # value behind that would claim otherwise.
        self.compress = False

    if self.header:
        self.dendrogram["data"]["feature_names"] = list(self.header) if write_data else []

    # len() rather than truthiness: the column clustering may be an array.
    if self.axis == "both" and len(self.cluster_object.column_clustering):
        self.dendrogram["column_dendrogram"] = self.__get_column_dendrogram__()
    return
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
131
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __compress_data__(self):
    """Collapse low-distance subtrees of ``self.dendrogram["data"]`` into single rows.

    Starting from every leaf, the walk climbs towards the root for as long as
    the parent's ``distance`` does not exceed
    ``self.compress_cluster_treshold``. Every node passed on the way is
    removed; the node where the walk stops collects the leaf's ``objects``
    and ``features``. Each collecting node is then turned into a leaf
    (``count`` 1, ``distance`` 0, no children) whose ``features`` are the
    column-wise median or mean of the collected rows, as selected by
    ``self.compressed_value``. Finally the node counts are recomputed.

    The walk stops at the root, so a threshold that covers the whole tree
    folds everything into the root instead of raising ``KeyError``.
    """
    nodes = self.dendrogram["data"]["nodes"]
    to_remove = set()

    compressed_value2fnc = {
        "median": lambda values: [round(numpy.median(value), 3) for value in values],
        "mean": lambda values: [round(numpy.average(value), 3) for value in values],
    }

    for leaf_id in list(nodes):
        node = nodes[leaf_id]
        if node["count"] != 1:
            continue

        objects = node["objects"]
        data = node["features"]
        node_id = leaf_id

        # The root has no "parent" key, so the climb has to end there.
        while "parent" in node and nodes[node["parent"]]["distance"] <= self.compress_cluster_treshold:
            to_remove.add(node_id)
            node_id = node["parent"]
            node = nodes[node_id]

        if node["count"] != 1:
            if "objects" not in node:
                node["objects"] = []
                node["features"] = []

            node["objects"].extend(objects)

            # len() keeps this valid for list rows and for array rows alike.
            if len(data):
                node["features"].append(data)

    for node_id in to_remove:
        nodes.pop(node_id)

    for node in nodes.values():
        if "objects" in node and node["count"] != 1:
            node["distance"] = 0
            node["count"] = 1
            node.pop("left_child")
            node.pop("right_child")
            # Transpose the collected rows so each feature column is aggregated on its own.
            columns = zip(*node["features"])
            node["features"] = compressed_value2fnc[self.compressed_value](columns)

    self.__adjust_node_counts__()

    return
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
181
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __adjust_node_counts__(self):
    """Recompute ``count`` of every inner node in ``self.dendrogram["data"]``.

    Nodes whose count is greater than 1 are reset to 0; every remaining node
    is treated as a leaf and adds 1 to each of its ancestors. A leaf without
    a ``parent`` key (the tree was folded into its root) contributes nothing
    instead of raising ``KeyError``.
    """
    nodes = self.dendrogram["data"]["nodes"]
    leaves = []

    for node_id, node in nodes.items():
        if node["count"] > 1:
            node["count"] = 0
        else:
            leaves.append(node_id)

    for leaf_id in leaves:
        # Compare against None rather than relying on the truthiness of an id.
        parent_id = nodes[leaf_id].get("parent")
        while parent_id is not None:
            parent = nodes[parent_id]
            parent["count"] += 1
            parent_id = parent.get("parent")
    return
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
202
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __get_distance_treshold__(self, cluster_count):
    """Search for a cut distance that yields ``cluster_count`` flat clusters.

    Returns -1 when ``cluster_count`` is not smaller than the number of
    leaves in ``self.tree``. Otherwise the candidate distance starts at 0 and
    moves in steps that start at half of the root distance: it advances by
    one step while the cut gives too many clusters, and backs off by one step
    (halving the step afterwards) when it gives too few. The search ends as
    soon as the cut matches exactly, or once the step drops below 0.1, in
    which case the candidate plus twice the current step is returned.
    """
    print("Calculating distance treshold for cluster compression...")
    if self.tree.count <= cluster_count:
        return -1

    candidate = 0
    step = self.tree.dist/2

    while step >= 0.1:
        labels = hcluster.fcluster(self.clustering, candidate, "distance")
        found = len(set(labels))

        if found == cluster_count:
            return candidate

        if found > cluster_count:
            # Still too many clusters: cut higher.
            candidate += step
            continue

        # Too few clusters: at distance 0 there is nothing left to back off to.
        if candidate == 0:
            return 0
        candidate = candidate - step
        step = step/2

    return candidate+step*2
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
225
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def export_cluster_heatmap_as_json(self, filename=None):
    """Returns cluster heatmap in a JSON format or exports it to the file specified by the filename parameter.

    The JSON text (``self.dendrogram`` dumped with a 4-space indent) is
    returned in both cases; when ``filename`` is given it is also written
    there, and the file is closed before returning.
    """
    dendrogram_json = json.dumps(self.dendrogram, indent=4)
    if filename:
        with open(filename, "w") as output:
            output.write(dendrogram_json)
    return dendrogram_json
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
233
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def export_cluster_heatmap_as_html(self, htmldir="."):
    """Export simple HTML page with embedded cluster heatmap and dependencies to given directory.

    ``htmldir`` is created when missing. The three JavaScript libraries the
    page loads are downloaded next to it, and the page itself is written as
    ``inchlib.html`` with ``self.dendrogram`` embedded as JSON.

    Raises ``Exception`` when one of the libraries cannot be downloaded.
    """
    from contextlib import closing
    # Resolve the URL helpers locally so the method works on Python 2 and 3.
    try:
        from urllib2 import URLError, urlopen
    except ImportError:
        from urllib.error import URLError
        from urllib.request import urlopen

    if not os.path.exists(htmldir):
        os.makedirs(htmldir)

    dendrogram_json = json.dumps(self.dendrogram, indent=4)
    # Doubled braces are literal JavaScript braces; the single "{}" receives the JSON.
    template = """<html>
    <head>
        <script src="jquery-2.0.3.min.js"></script>
        <script src="kinetic-v5.0.0.min.js"></script>
        <script src="inchlib-1.0.1.min.js"></script>
        <script>
        $(document).ready(function() {{
            var data = {};
            var inchlib = new InCHlib({{
                target: "inchlib",
                max_height: 1200,
                width: 1000,
            }});
            inchlib.read_data(data);
            inchlib.draw();
        }});
        </script>
    </head>

    <body>
        <div id="inchlib"></div>
    </body>
</html>""".format(dendrogram_json)

    lib2url = {"inchlib-1.0.1.min.js": "http://openscreen.cz/software/inchlib/static/js/inchlib-1.0.1.min.js",
               "jquery-2.0.3.min.js": "http://openscreen.cz/software/inchlib/static/js/jquery-2.0.3.min.js",
               "kinetic-v5.0.0.min.js": "http://openscreen.cz/software/inchlib/static/js/kinetic-v5.0.0.min.js"}

    for lib, url in lib2url.items():
        try:
            with closing(urlopen(url)) as source:
                source_html = source.read()
        except URLError:
            raise Exception("\nCan't download file {}.\nPlease check your internet connection and try again.\nIf the error persists there can be something wrong with the InCHlib server.\n".format(url))

        # The response body is bytes, so it is written in binary mode.
        with open(os.path.join(htmldir, lib), "wb") as output:
            output.write(source_html)

    with open(os.path.join(htmldir, "inchlib.html"), "w") as output:
        output.write(template)
    return
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
279
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def add_metadata_from_file(self, metadata_file, delimiter, header=True, metadata_compressed_value="median"):
    """Load metadata from a csv file and attach it to the dendrogram.

    The file is parsed by ``__read_metadata_file__`` using ``delimiter`` and
    ``header``. ``metadata_compressed_value`` names the value used for
    compressed rows (median/mean/frequency).
    """
    self.metadata_compressed_value = metadata_compressed_value

    parsed = self.__read_metadata_file__(metadata_file, delimiter, header)
    self.metadata, self.metadata_header = parsed

    self.__connect_metadata_to_data__()
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
287
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def add_metadata(self, metadata, header=True, metadata_compressed_value="median"):
    """Attach metadata given as a list of lists (tuples) to the dendrogram.

    The rows are parsed by ``__read_metadata__`` using ``header``.
    ``metadata_compressed_value`` names the value used for compressed rows
    (median/mean/frequency).
    """
    self.metadata_compressed_value = metadata_compressed_value

    parsed = self.__read_metadata__(metadata, header)
    self.metadata, self.metadata_header = parsed

    self.__connect_metadata_to_data__()
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
295
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
296 def __connect_metadata_to_data__(self):
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
297 if len(set(self.metadata.keys()) & set(self.data_names)) == 0:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
298 raise Exception("Metadata objects must correspond with original data objects.")
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
299
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
300 if not self.dendrogram:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
301 raise Exception("You must create dendrogram before adding metadata.")
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
302
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
303 self.dendrogram["metadata"] = {"nodes":{}}
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
304
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
305 if self.metadata_header:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
306 self.dendrogram["metadata"]["feature_names"] = self.metadata_header
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
307
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
308 leaves = {n:node for n, node in self.dendrogram["data"]["nodes"].items() if node["count"] == 1}
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
309
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
310 if not self.compress:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
311
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
312 for leaf_id, leaf in leaves.items():
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
313 try:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
314 self.dendrogram["metadata"]["nodes"][leaf_id] = self.metadata[leaf["objects"][0]]
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
315 except Exception, e:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
316 continue
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
317 else:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
318 compressed_value2fnc = {
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
319 "median": lambda values: round(numpy.median(col), 3),
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
320 "mean": lambda values: round(numpy.average(col), 3)
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
321 }
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
322
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
323 for leaf in leaves:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
324 objects = []
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
325 for item in leaves[leaf]["objects"]:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
326 try:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
327 objects.append(self.metadata[item])
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
328 except Exception, e:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
329 continue
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
330
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
331 cols = zip(*objects)
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
332 row = []
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
333 cols = [list(c) for c in cols]
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
334
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
335 for col in cols:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
336
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
337 if self.metadata_compressed_value in compressed_value2fnc:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
338 try:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
339 col = [float(c) for c in col]
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
340 value = compressed_value2fnc[self.metadata_compressed_value](col)
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
341 except ValueError:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
342 freq2val = {col.count(v):v for v in set(col)}
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
343 value = freq2val[max(freq2val.keys())]
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
344
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
345 elif self.metadata_compressed_value == "frequency":
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
346 freq2val = {col.count(v):v for v in set(col)}
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
347 value = freq2val[max(freq2val.keys())]
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
348
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
349 else:
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
350 raise Exception("Unkown type of metadata_compressed_value: {}. Possible values are: median, mean, frequency.".format(self.metadata_compressed_value))
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
351
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
352 row.append(value)
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
353
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
354 self.dendrogram["metadata"]["nodes"][leaf] = row
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
355 return
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
356
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __read_metadata__(self, metadata, header):
    """Turns metadata rows into a lookup keyed by object ID.

    @metadata - sequence of rows (lists/tuples); the first item of every row is
                the object ID, the remaining items are its metadata values
    @header - when true, the first row is treated as the header and is not stored as data

    Returns a tuple (metadata, metadata_header):
      metadata        - dict mapping str(ID) to the list of the remaining row values
      metadata_header - the first row without its first cell, or [] when there is no header
    """
    if header:
        # The first cell of the header row labels the ID column, so it is dropped.
        column_names = metadata[0][1:]
        records = metadata[1:]
    else:
        column_names = []
        records = metadata[0:]

    # When an ID occurs more than once, the last row with that ID wins.
    values_by_id = {str(record[0]): list(record[1:]) for record in records}
    return values_by_id, column_names
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
371
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
372
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def __read_metadata_file__(self, metadata_file, delimiter, header):
    """Reads a delimited metadata file into a lookup keyed by object ID.

    @metadata_file - path of the file to read
    @delimiter - character separating the values on a line
    @header - when true, the first non-blank row is treated as the header and is not stored as data

    Returns a tuple (metadata, metadata_header):
      metadata        - dict mapping str(ID) (first cell of a row) to the list of the remaining cells
      metadata_header - the header row without its first cell, or [] when there is no header

    Blank lines in the file are ignored.
    """
    # Close the file as soon as the rows are read instead of relying on garbage collection.
    with open(metadata_file, "r") as metadata_handle:
        # csv.reader yields [] for a blank line; such a row has no ID cell, so it is dropped.
        rows = [row for row in csv.reader(metadata_handle, delimiter=delimiter) if row]

    metadata_header = []
    data_start = 0

    # An empty file has no header row to take, even when one was requested.
    if header and rows:
        metadata_header = rows[0][1:]
        data_start = 1

    # When an ID occurs more than once, the last row with that ID wins.
    metadata = {str(row[0]): list(row[1:]) for row in rows[data_start:]}
    return metadata, metadata_header
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
389
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
390
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
class Cluster():
    """Class for data clustering

    Typical use: load rows with read_csv()/read_data(), optionally call
    normalize_data(), then call cluster_data(). The results stay on the instance:
      data_names        - row identifiers (first cell of every data row)
      data              - feature values, one list per row
      original_data     - deep copy of the values as they were loaded
      header            - column names, or the falsy value passed in when there is no header
      clustering        - row linkage returned by fastcluster
      column_clustering - column linkage, or [] when columns were not clustered
    """

    def __init__(self):
        # Set by normalize_data(); when true, cluster_data() restores the
        # original (non-normalized) values after clustering.
        self.write_original = False

    def read_csv(self, filename, delimiter=",", header=False):
        """Reads data from the CSV file

        @filename - path of the file to read
        @delimiter - character separating the values on a line
        @header - whether the first row holds column names

        Blank lines in the file are ignored.
        """
        self.filename = filename
        # Close the file as soon as the rows are read instead of relying on garbage collection.
        with open(self.filename, "r") as data_handle:
            # csv.reader yields [] for a blank line; such a row has no name cell, so it is dropped.
            rows = [row for row in csv.reader(data_handle, delimiter=delimiter) if row]
        self.read_data(rows, header)

    def read_data(self, rows, header=False):
        """Reads data in a form of list of lists (tuples)

        @rows - the first item of every row is the row name, the rest are values convertible to float
        @header - whether the first row holds column names

        Values are rounded to 3 decimal places. A ValueError from float() is
        raised when a value is not numeric.
        """
        self.header = header
        data_start = 0

        if self.header:
            # The first cell of the header row labels the name column, so it is dropped.
            self.header = rows[0][1:]
            data_start = 1

        data_rows = rows[data_start:]
        self.data_names = [str(row[0]) for row in data_rows]
        self.data = [[round(float(value), 3) for value in row[1:]] for row in data_rows]
        # Kept separately so the untouched values can be written out after
        # clustering normalized data.
        self.original_data = copy.deepcopy(self.data)
        return

    def normalize_data(self, feature_range=(0,1), write_original=False):
        """Normalizes every column to the given feature_range (0 to 1 by default).
        When write_original is set to True, the normalized data will be clustered,
        but original data will be written to the heatmap."""
        self.write_original = write_original
        min_max_scaler = preprocessing.MinMaxScaler(feature_range=feature_range)
        scaled = min_max_scaler.fit_transform(self.data)
        # Back to plain lists of rounded values, the same shape read_data() produces.
        self.data = [[round(v, 3) for v in row] for row in scaled]
        return

    def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Performs clustering according to the given parameters.
        @data_type - numeric/binary
        @row_distance/column_distance - see. DISTANCES variable
        @row_linkage/column_linkage - see. LINKAGES variable
        @axis - row/both

        Raises Exception when, for a linkage outside RAW_LINKAGES, the data type
        is unknown or the row distance is not offered for that data type.
        """

        print("Clustering rows:", row_distance, row_linkage)
        self.data_type = data_type
        self.clustering_axis = axis
        row_linkage = str(row_linkage)

        if row_linkage in RAW_LINKAGES:
            # These linkages are computed by fastcluster directly from the observations.
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)

        else:
            # Validate before the pairwise distances are computed, so that a bad
            # choice fails fast with a readable message.
            if data_type not in DISTANCES:
                raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())]))
            if row_distance not in DISTANCES[data_type]:
                raise Exception("When clustering {} data you must choose from these distance measures: {}".format(data_type, ", ".join(DISTANCES[data_type])))

            self.distance_vector = fastcluster.pdist(self.data, row_distance)
            self.clustering = fastcluster.linkage(self.distance_vector, method=row_linkage)

        self.column_clustering = []
        # Columns are clustered only on request and only when there are more than two of them.
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original:
            # Clustering used the normalized values; hand the original ones back for output.
            self.data = self.original_data

        return

    def __cluster_columns__(self, column_distance, column_linkage):
        """Clusters the columns and reorders data, original_data and header
        to follow the leaf order of the column clustering."""
        # Transpose the rows into columns; list() so that a real sequence (not a
        # lazy zip iterator) is passed to fastcluster.
        columns = list(zip(*self.data))
        self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
        self.data_order = hcluster.leaves_list(self.column_clustering)
        self.data = self.__reorder_data__(self.data, self.data_order)
        self.original_data = self.__reorder_data__(self.original_data, self.data_order)
        if self.header:
            self.header = self.__reorder_data__([self.header], self.data_order)[0]
        return

    def __reorder_data__(self, data, order):
        """Rearranges the items of every row of data to follow order, reversed.

        @data - list of rows; it is modified in place and also returned
        @order - column indexes in the wanted order
        """
        for i, row in enumerate(data):
            picked = [row[j] for j in order]
            picked.reverse()
            data[i] = picked

        return data
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
482
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
def _process_(arguments):
    """Runs the whole pipeline for the parsed command-line arguments:
    read the data, optionally normalize it, cluster it, build the cluster
    heatmap, optionally attach metadata and finally export the result.

    The result goes to the JSON output file when one is given, otherwise to the
    HTML directory when one is given, otherwise it is printed as JSON.
    """
    clusterer = Cluster()
    clusterer.read_csv(arguments.data_file, arguments.data_delimiter, arguments.data_header)

    if arguments.normalize:
        clusterer.normalize_data(feature_range=(0,1), write_original=arguments.write_original)

    clusterer.cluster_data(
        data_type=arguments.datatype,
        row_distance=arguments.row_distance,
        row_linkage=arguments.row_linkage,
        axis=arguments.axis,
        column_distance=arguments.column_distance,
        column_linkage=arguments.column_linkage,
    )

    heatmap = Dendrogram(clusterer)
    heatmap.create_cluster_heatmap(
        compress=arguments.compress,
        compressed_value=arguments.compressed_value,
        write_data=not arguments.dont_write_data,
    )

    if arguments.metadata:
        heatmap.add_metadata_from_file(
            metadata_file=arguments.metadata,
            delimiter=arguments.metadata_delimiter,
            header=arguments.metadata_header,
            metadata_compressed_value=arguments.metadata_compressed_value,
        )

    # The output file takes precedence over the HTML directory when both are given.
    if arguments.output_file:
        heatmap.export_cluster_heatmap_as_json(arguments.output_file)
    elif arguments.html_dir:
        heatmap.export_cluster_heatmap_as_html(arguments.html_dir)
    else:
        print(json.dumps(heatmap.dendrogram, indent=4))
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
505
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
506
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: declare the options, parse them and hand them to _process_().
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Input and output locations
    parser.add_argument("data_file", type=str, help="csv(text) data file with delimited values")
    parser.add_argument("-o", "--output_file", type=str, help="the name of output file")
    parser.add_argument("-html", "--html_dir", type=str, help="the directory to store HTML page with dependencies")

    # Clustering parameters
    parser.add_argument("-rd", "--row_distance", type=str, default="euclidean", help="set the distance to use for clustering rows")
    parser.add_argument("-rl", "--row_linkage", type=str, default="ward", help="set the linkage to use for clustering rows")
    parser.add_argument("-cd", "--column_distance", type=str, default="euclidean", help="set the distance to use for clustering columns (only when clustering by both axis -a parameter)")
    parser.add_argument("-cl", "--column_linkage", type=str, default="ward", help="set the linkage to use for clustering columns (only when clustering by both axis -a parameter)")
    parser.add_argument("-a", "--axis", type=str, default="row", help="define clustering axis (row/both)")
    parser.add_argument("-dt", "--datatype", type=str, default="numeric", help="specify the type of the data (numeric/binary)")

    # Format of the data and metadata files
    parser.add_argument("-dd", "--data_delimiter", type=str, default=",", help="delimiter of values in data file")
    parser.add_argument("-m", "--metadata", type=str, default=None, help="csv(text) metadata file with delimited values")
    parser.add_argument("-md", "--metadata_delimiter", type=str, default=",", help="delimiter of values in metadata file")
    parser.add_argument("-dh", "--data_header", default=False, help="whether the first row of data file is a header", action="store_true")
    parser.add_argument("-mh", "--metadata_header", default=False, help="whether the first row of metadata file is a header", action="store_true")

    # Compression of the output
    parser.add_argument("-c", "--compress", type=int, default=0, help="compress the data to contain maximum of specified count of rows")
    parser.add_argument("-cv", "--compressed_value", type=str, default="median", help="the resulted value from merged rows when the data are compressed (median/mean/frequency)")
    # The values accepted for metadata are median, mean and frequency.
    parser.add_argument("-mcv", "--metadata_compressed_value", type=str, default="median", help="the resulted value from merged rows of metadata when the data are compressed (median/mean/frequency)")

    # What gets written and how the data are prepared
    parser.add_argument("-dwd", "--dont_write_data", default=False, help="don't write clustered data to the inchlib data format", action="store_true")
    parser.add_argument("-n", "--normalize", default=False, help="normalize data to [0, 1] range", action="store_true")
    parser.add_argument("-wo", "--write_original", default=False, help="cluster normalized data but write the original ones to the heatmap", action="store_true")

    args = parser.parse_args()
    _process_(args)
60f93f839759 Uploaded
saket-choudhary
parents:
diff changeset
533