Mercurial > repos > bimib > marea
comparison Marea/marea_cluster.py @ 23:a8825e66c3a0 draft
Uploaded
| author | bimib |
|---|---|
| date | Wed, 02 Oct 2019 08:23:53 -0400 |
| parents | c71ac0bb12de |
| children | 9992eba50cfb |
Diff legend (line classes): equal, deleted, inserted, replaced
| 22:ac70e60d5789 | 23:a8825e66c3a0 |
|---|---|
| 1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
| 2 """ | 2 """ |
| 3 Created on Mon Jun 3 19:51:00 2019 | 3 Created on Mon Jun 3 19:51:00 2019 |
| 4 | |
| 5 @author: Narger | 4 @author: Narger |
| 6 """ | 5 """ |
| 7 | 6 |
| 8 import sys | 7 import sys |
| 9 import argparse | 8 import argparse |
| 120 return dataset | 119 return dataset |
| 121 | 120 |
| 122 ############################## write to csv ################################## | 121 ############################## write to csv ################################## |
| 123 | 122 |
| 124 def write_to_csv (dataset, labels, name): | 123 def write_to_csv (dataset, labels, name): |
| 125 list_labels = labels | 124 #labels = predict |
| 126 list_values = dataset | 125 predict = [x+1 for x in labels] |
| 127 | 126 |
| 128 list_values = list_values.tolist() | 127 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) |
| 129 d = {'Label' : list_labels, 'Value' : list_values} | 128 |
| 130 | 129 dest = name |
| 131 df = pd.DataFrame(d, columns=['Value','Label']) | 130 classe.to_csv(dest, sep = '\t', index = False, |
| 132 | 131 header = ['Patient_ID', 'Class']) |
| 133 dest = name + '.tsv' | 132 |
| 134 df.to_csv(dest, sep = '\t', index = False, | 133 |
| 135 header = ['Value', 'Label']) | 134 #list_labels = labels |
| 135 #list_values = dataset | |
| 136 | |
| 137 #list_values = list_values.tolist() | |
| 138 #d = {'Label' : list_labels, 'Value' : list_values} | |
| 139 | |
| 140 #df = pd.DataFrame(d, columns=['Value','Label']) | |
| 141 | |
| 142 #dest = name + '.tsv' | |
| 143 #df.to_csv(dest, sep = '\t', index = False, | |
| 144 # header = ['Value', 'Label']) | |
| 136 | 145 |
| 137 ########################### trova il massimo in lista ######################## | 146 ########################### trova il massimo in lista ######################## |
| 138 def max_index (lista): | 147 def max_index (lista): |
| 139 best = -1 | 148 best = -1 |
| 140 best_index = 0 | 149 best_index = 0 |
| 146 return best_index | 155 return best_index |
| 147 | 156 |
| 148 ################################ kmeans ##################################### | 157 ################################ kmeans ##################################### |
| 149 | 158 |
| 150 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): | 159 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): |
| 151 if not os.path.exists('clustering/kmeans_output'): | 160 if not os.path.exists('clustering'): |
| 152 os.makedirs('clustering/kmeans_output') | 161 os.makedirs('clustering') |
| 153 | 162 |
| 154 | 163 |
| 155 if elbow == 'true': | 164 if elbow == 'true': |
| 156 elbow = True | 165 elbow = True |
| 157 else: | 166 else: |
| 187 for i in range(len(all_labels)): | 196 for i in range(len(all_labels)): |
| 188 prefix = '' | 197 prefix = '' |
| 189 if (i + k_min == best): | 198 if (i + k_min == best): |
| 190 prefix = '_BEST' | 199 prefix = '_BEST' |
| 191 | 200 |
| 192 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') | 201 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') |
| 193 | 202 |
| 194 if davies: | 203 if davies: |
| 195 with np.errstate(divide='ignore', invalid='ignore'): | 204 with np.errstate(divide='ignore', invalid='ignore'): |
| 196 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) | 205 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) |
| 197 warning("\nFor n_clusters = " + str(i + k_min) + | 206 warning("\nFor n_clusters = " + str(i + k_min) + |
| 198 " The average davies bouldin score is: " + str(davies_bouldin)) | 207 " The average davies bouldin score is: " + str(davies_bouldin)) |
| 199 | 208 |
| 200 | 209 |
| 201 if silhouette: | 210 if silhouette: |
| 202 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') | 211 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') |
| 203 | 212 |
| 204 | 213 |
| 205 if elbow: | 214 if elbow: |
| 206 elbow_plot(distortions, k_min,k_max) | 215 elbow_plot(distortions, k_min,k_max) |
| 207 | 216 |
| 214 def elbow_plot (distortions, k_min, k_max): | 223 def elbow_plot (distortions, k_min, k_max): |
| 215 plt.figure(0) | 224 plt.figure(0) |
| 216 plt.plot(range(k_min, k_max+1), distortions, marker = 'o') | 225 plt.plot(range(k_min, k_max+1), distortions, marker = 'o') |
| 217 plt.xlabel('Number of cluster') | 226 plt.xlabel('Number of cluster') |
| 218 plt.ylabel('Distortion') | 227 plt.ylabel('Distortion') |
| 219 s = 'clustering/kmeans_output/elbow_plot.png' | 228 s = 'clustering/elbow_plot.png' |
| 220 fig = plt.gcf() | 229 fig = plt.gcf() |
| 221 fig.set_size_inches(18.5, 10.5, forward = True) | 230 fig.set_size_inches(18.5, 10.5, forward = True) |
| 222 fig.savefig(s, dpi=100) | 231 fig.savefig(s, dpi=100) |
| 223 | 232 |
| 224 | 233 |
| 286 plt.savefig(path, bbox_inches='tight') | 295 plt.savefig(path, bbox_inches='tight') |
| 287 | 296 |
| 288 ######################## dbscan ############################################## | 297 ######################## dbscan ############################################## |
| 289 | 298 |
| 290 def dbscan(dataset, eps, min_samples): | 299 def dbscan(dataset, eps, min_samples): |
| 291 if not os.path.exists('clustering/dbscan_output'): | 300 if not os.path.exists('clustering'): |
| 292 os.makedirs('clustering/dbscan_output') | 301 os.makedirs('clustering') |
| 293 | 302 |
| 294 if eps is not None: | 303 if eps is not None: |
| 295 clusterer = DBSCAN(eps = eps, min_samples = min_samples) | 304 clusterer = DBSCAN(eps = eps, min_samples = min_samples) |
| 296 else: | 305 else: |
| 297 clusterer = DBSCAN() | 306 clusterer = DBSCAN() |
| 303 labels = clustering.labels_ | 312 labels = clustering.labels_ |
| 304 | 313 |
| 305 # Number of clusters in labels, ignoring noise if present. | 314 # Number of clusters in labels, ignoring noise if present. |
| 306 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) | 315 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) |
| 307 | 316 |
| 308 silhouette_avg = silhouette_score(dataset, labels) | |
| 309 warning("For n_clusters =" + str(n_clusters_) + | |
| 310 "The average silhouette_score is :" + str(silhouette_avg)) | |
| 311 | 317 |
| 312 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL | 318 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL |
| 313 | 319 |
| 314 # Black removed and is used for noise instead. | 320 |
| 315 unique_labels = set(labels) | 321 write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv') |
| 316 colors = [plt.cm.Spectral(each) | |
| 317 for each in np.linspace(0, 1, len(unique_labels))] | |
| 318 for k, col in zip(unique_labels, colors): | |
| 319 if k == -1: | |
| 320 # Black used for noise. | |
| 321 col = [0, 0, 0, 1] | |
| 322 | |
| 323 class_member_mask = (labels == k) | |
| 324 | |
| 325 xy = dataset[class_member_mask & core_samples_mask] | |
| 326 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), | |
| 327 markeredgecolor='k', markersize=14) | |
| 328 | |
| 329 xy = dataset[class_member_mask & ~core_samples_mask] | |
| 330 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), | |
| 331 markeredgecolor='k', markersize=6) | |
| 332 | |
| 333 plt.title('Estimated number of clusters: %d' % n_clusters_) | |
| 334 s = 'clustering/dbscan_output/dbscan_plot.png' | |
| 335 fig = plt.gcf() | |
| 336 fig.set_size_inches(18.5, 10.5, forward = True) | |
| 337 fig.savefig(s, dpi=100) | |
| 338 | |
| 339 | |
| 340 write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv') | |
| 341 | 322 |
| 342 ########################## hierachical ####################################### | 323 ########################## hierachical ####################################### |
| 343 | 324 |
| 344 def hierachical_agglomerative(dataset, k_min, k_max): | 325 def hierachical_agglomerative(dataset, k_min, k_max): |
| 345 | 326 |
| 346 if not os.path.exists('clustering/agglomerative_output'): | 327 if not os.path.exists('clustering'): |
| 347 os.makedirs('clustering/agglomerative_output') | 328 os.makedirs('clustering') |
| 348 | 329 |
| 349 plt.figure(figsize=(10, 7)) | 330 plt.figure(figsize=(10, 7)) |
| 350 plt.title("Customer Dendograms") | 331 plt.title("Customer Dendograms") |
| 351 shc.dendrogram(shc.linkage(dataset, method='ward')) | 332 shc.dendrogram(shc.linkage(dataset, method='ward')) |
| 352 fig = plt.gcf() | 333 fig = plt.gcf() |
| 353 fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200) | 334 fig.savefig('clustering/dendogram.png', dpi=200) |
| 354 | 335 |
| 355 range_n_clusters = [i for i in range(k_min, k_max+1)] | 336 range_n_clusters = [i for i in range(k_min, k_max+1)] |
| 356 | 337 |
| 357 for n_clusters in range_n_clusters: | 338 for n_clusters in range_n_clusters: |
| 358 | 339 |
| 359 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') | 340 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') |
| 360 cluster.fit_predict(dataset) | 341 cluster.fit_predict(dataset) |
| 361 cluster_labels = cluster.labels_ | 342 cluster_labels = cluster.labels_ |
| 362 | 343 |
| 363 silhouette_avg = silhouette_score(dataset, cluster_labels) | 344 silhouette_avg = silhouette_score(dataset, cluster_labels) |
| 364 warning("For n_clusters =", n_clusters, | 345 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') |
| 365 "The average silhouette_score is :", silhouette_avg) | 346 #warning("For n_clusters =", n_clusters, |
| 366 | 347 #"The average silhouette_score is :", silhouette_avg) |
| 367 plt.clf() | 348 |
| 368 plt.figure(figsize=(10, 7)) | |
| 369 plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score") | |
| 370 plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow') | |
| 371 s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png' | |
| 372 fig = plt.gcf() | |
| 373 fig.set_size_inches(10, 7, forward = True) | |
| 374 fig.savefig(s, dpi=200) | |
| 375 | |
| 376 write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv') | |
| 377 | 349 |
| 378 | 350 |
| 379 | 351 |
| 380 | 352 |
| 381 ############################# main ########################################### | 353 ############################# main ########################################### |
| 396 | 368 |
| 397 for i in X.columns: | 369 for i in X.columns: |
| 398 tmp = X[i][0] | 370 tmp = X[i][0] |
| 399 if tmp == None: | 371 if tmp == None: |
| 400 X = X.drop(columns=[i]) | 372 X = X.drop(columns=[i]) |
| 401 | |
| 402 X = pd.DataFrame.to_numpy(X) | |
| 403 | 373 |
| 404 | 374 |
| 405 if args.cluster_type == 'kmeans': | 375 if args.cluster_type == 'kmeans': |
| 406 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) | 376 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) |
| 407 | 377 |
