Mercurial > repos > bimib > marea
comparison Marea/marea_cluster.py @ 23:a8825e66c3a0 draft
Uploaded
author | bimib |
---|---|
date | Wed, 02 Oct 2019 08:23:53 -0400 |
parents | c71ac0bb12de |
children | 9992eba50cfb |
comparison
equal
deleted
inserted
replaced
22:ac70e60d5789 | 23:a8825e66c3a0 |
---|---|
1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
2 """ | 2 """ |
3 Created on Mon Jun 3 19:51:00 2019 | 3 Created on Mon Jun 3 19:51:00 2019 |
4 | |
5 @author: Narger | 4 @author: Narger |
6 """ | 5 """ |
7 | 6 |
8 import sys | 7 import sys |
9 import argparse | 8 import argparse |
120 return dataset | 119 return dataset |
121 | 120 |
122 ############################## write to csv ################################## | 121 ############################## write to csv ################################## |
123 | 122 |
def write_to_csv (dataset, labels, name):
    """Write per-sample cluster assignments to a tab-separated file.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input data; only its index (the patient identifiers) is used.
    labels : sequence of int
        Zero-based cluster label for each row of ``dataset``.
    name : str
        Destination file path for the TSV output.
    """
    # Shift labels so the reported classes start at 1 instead of 0.
    predict = [x + 1 for x in labels]

    # Two columns: patient id and its (1-based) cluster, both as strings.
    classe = pd.DataFrame(list(zip(dataset.index, predict))).astype(str)

    classe.to_csv(name, sep = '\t', index = False,
                  header = ['Patient_ID', 'Class'])
136 | 145 |
137 ########################### trova il massimo in lista ######################## | 146 ########################### trova il massimo in lista ######################## |
138 def max_index (lista): | 147 def max_index (lista): |
139 best = -1 | 148 best = -1 |
140 best_index = 0 | 149 best_index = 0 |
146 return best_index | 155 return best_index |
147 | 156 |
148 ################################ kmeans ##################################### | 157 ################################ kmeans ##################################### |
149 | 158 |
150 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): | 159 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): |
151 if not os.path.exists('clustering/kmeans_output'): | 160 if not os.path.exists('clustering'): |
152 os.makedirs('clustering/kmeans_output') | 161 os.makedirs('clustering') |
153 | 162 |
154 | 163 |
155 if elbow == 'true': | 164 if elbow == 'true': |
156 elbow = True | 165 elbow = True |
157 else: | 166 else: |
187 for i in range(len(all_labels)): | 196 for i in range(len(all_labels)): |
188 prefix = '' | 197 prefix = '' |
189 if (i + k_min == best): | 198 if (i + k_min == best): |
190 prefix = '_BEST' | 199 prefix = '_BEST' |
191 | 200 |
192 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') | 201 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') |
193 | 202 |
194 if davies: | 203 if davies: |
195 with np.errstate(divide='ignore', invalid='ignore'): | 204 with np.errstate(divide='ignore', invalid='ignore'): |
196 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) | 205 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) |
197 warning("\nFor n_clusters = " + str(i + k_min) + | 206 warning("\nFor n_clusters = " + str(i + k_min) + |
198 " The average davies bouldin score is: " + str(davies_bouldin)) | 207 " The average davies bouldin score is: " + str(davies_bouldin)) |
199 | 208 |
200 | 209 |
201 if silhouette: | 210 if silhouette: |
202 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') | 211 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') |
203 | 212 |
204 | 213 |
205 if elbow: | 214 if elbow: |
206 elbow_plot(distortions, k_min,k_max) | 215 elbow_plot(distortions, k_min,k_max) |
207 | 216 |
def elbow_plot (distortions, k_min, k_max):
    """Draw the elbow curve (distortion vs. k) and save it as a PNG.

    Expects ``distortions`` to hold one value per k in [k_min, k_max];
    the figure is written to ``clustering/elbow_plot.png``.
    """
    plt.figure(0)
    cluster_counts = range(k_min, k_max + 1)
    plt.plot(cluster_counts, distortions, marker = 'o')
    plt.xlabel('Number of cluster')
    plt.ylabel('Distortion')
    figure = plt.gcf()
    figure.set_size_inches(18.5, 10.5, forward = True)
    figure.savefig('clustering/elbow_plot.png', dpi=100)
223 | 232 |
224 | 233 |
286 plt.savefig(path, bbox_inches='tight') | 295 plt.savefig(path, bbox_inches='tight') |
287 | 296 |
288 ######################## dbscan ############################################## | 297 ######################## dbscan ############################################## |
289 | 298 |
290 def dbscan(dataset, eps, min_samples): | 299 def dbscan(dataset, eps, min_samples): |
291 if not os.path.exists('clustering/dbscan_output'): | 300 if not os.path.exists('clustering'): |
292 os.makedirs('clustering/dbscan_output') | 301 os.makedirs('clustering') |
293 | 302 |
294 if eps is not None: | 303 if eps is not None: |
295 clusterer = DBSCAN(eps = eps, min_samples = min_samples) | 304 clusterer = DBSCAN(eps = eps, min_samples = min_samples) |
296 else: | 305 else: |
297 clusterer = DBSCAN() | 306 clusterer = DBSCAN() |
303 labels = clustering.labels_ | 312 labels = clustering.labels_ |
304 | 313 |
305 # Number of clusters in labels, ignoring noise if present. | 314 # Number of clusters in labels, ignoring noise if present. |
306 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) | 315 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) |
307 | 316 |
308 silhouette_avg = silhouette_score(dataset, labels) | |
309 warning("For n_clusters =" + str(n_clusters_) + | |
310 "The average silhouette_score is :" + str(silhouette_avg)) | |
311 | 317 |
312 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL | 318 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL |
313 | 319 |
314 # Black removed and is used for noise instead. | 320 |
315 unique_labels = set(labels) | 321 write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv') |
316 colors = [plt.cm.Spectral(each) | |
317 for each in np.linspace(0, 1, len(unique_labels))] | |
318 for k, col in zip(unique_labels, colors): | |
319 if k == -1: | |
320 # Black used for noise. | |
321 col = [0, 0, 0, 1] | |
322 | |
323 class_member_mask = (labels == k) | |
324 | |
325 xy = dataset[class_member_mask & core_samples_mask] | |
326 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), | |
327 markeredgecolor='k', markersize=14) | |
328 | |
329 xy = dataset[class_member_mask & ~core_samples_mask] | |
330 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), | |
331 markeredgecolor='k', markersize=6) | |
332 | |
333 plt.title('Estimated number of clusters: %d' % n_clusters_) | |
334 s = 'clustering/dbscan_output/dbscan_plot.png' | |
335 fig = plt.gcf() | |
336 fig.set_size_inches(18.5, 10.5, forward = True) | |
337 fig.savefig(s, dpi=100) | |
338 | |
339 | |
340 write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv') | |
341 | 322 |
342 ########################## hierachical ####################################### | 323 ########################## hierachical ####################################### |
343 | 324 |
def hierachical_agglomerative(dataset, k_min, k_max):
    """Run agglomerative (Ward) clustering for every k in [k_min, k_max].

    Saves a dendrogram of the full Ward linkage to
    ``clustering/dendogram.png``, then, for each cluster count k, writes
    the per-sample assignments to
    ``clustering/hierarchical_with_<k>_clusters.tsv``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Samples to cluster (rows are patients).
    k_min, k_max : int
        Inclusive range of cluster counts to evaluate.
    """
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    # Dendrogram of the full Ward linkage, saved once up front.
    plt.figure(figsize=(10, 7))
    plt.title("Customer Dendograms")
    shc.dendrogram(shc.linkage(dataset, method='ward'))
    fig = plt.gcf()
    fig.savefig('clustering/dendogram.png', dpi=200)

    for n_clusters in range(k_min, k_max + 1):
        cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
        cluster.fit_predict(dataset)
        # NOTE: the original computed silhouette_score here but never used
        # it (its consumer was commented out); the dead computation is removed.
        write_to_csv(dataset, cluster.labels_,
                     'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
378 | 350 |
379 | 351 |
380 | 352 |
381 ############################# main ########################################### | 353 ############################# main ########################################### |
396 | 368 |
397 for i in X.columns: | 369 for i in X.columns: |
398 tmp = X[i][0] | 370 tmp = X[i][0] |
399 if tmp == None: | 371 if tmp == None: |
400 X = X.drop(columns=[i]) | 372 X = X.drop(columns=[i]) |
401 | |
402 X = pd.DataFrame.to_numpy(X) | |
403 | 373 |
404 | 374 |
405 if args.cluster_type == 'kmeans': | 375 if args.cluster_type == 'kmeans': |
406 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) | 376 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) |
407 | 377 |