comparison Marea/marea_cluster.py @ 23:a8825e66c3a0 draft

Uploaded
author bimib
date Wed, 02 Oct 2019 08:23:53 -0400
parents c71ac0bb12de
children 9992eba50cfb
comparison
equal deleted inserted replaced
22:ac70e60d5789 23:a8825e66c3a0
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 """ 2 """
3 Created on Mon Jun 3 19:51:00 2019 3 Created on Mon Jun 3 19:51:00 2019
4
5 @author: Narger 4 @author: Narger
6 """ 5 """
7 6
8 import sys 7 import sys
9 import argparse 8 import argparse
120 return dataset 119 return dataset
121 120
122 ############################## write to csv ################################## 121 ############################## write to csv ##################################
123 122
124 def write_to_csv (dataset, labels, name): 123 def write_to_csv (dataset, labels, name):
125 list_labels = labels 124 #labels = predict
126 list_values = dataset 125 predict = [x+1 for x in labels]
127 126
128 list_values = list_values.tolist() 127 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
129 d = {'Label' : list_labels, 'Value' : list_values} 128
130 129 dest = name
131 df = pd.DataFrame(d, columns=['Value','Label']) 130 classe.to_csv(dest, sep = '\t', index = False,
132 131 header = ['Patient_ID', 'Class'])
133 dest = name + '.tsv' 132
134 df.to_csv(dest, sep = '\t', index = False, 133
135 header = ['Value', 'Label']) 134 #list_labels = labels
135 #list_values = dataset
136
137 #list_values = list_values.tolist()
138 #d = {'Label' : list_labels, 'Value' : list_values}
139
140 #df = pd.DataFrame(d, columns=['Value','Label'])
141
142 #dest = name + '.tsv'
143 #df.to_csv(dest, sep = '\t', index = False,
144 # header = ['Value', 'Label'])
136 145
137 ########################### trova il massimo in lista ######################## 146 ########################### trova il massimo in lista ########################
138 def max_index (lista): 147 def max_index (lista):
139 best = -1 148 best = -1
140 best_index = 0 149 best_index = 0
146 return best_index 155 return best_index
147 156
148 ################################ kmeans ##################################### 157 ################################ kmeans #####################################
149 158
150 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies): 159 def kmeans (k_min, k_max, dataset, elbow, silhouette, davies):
151 if not os.path.exists('clustering/kmeans_output'): 160 if not os.path.exists('clustering'):
152 os.makedirs('clustering/kmeans_output') 161 os.makedirs('clustering')
153 162
154 163
155 if elbow == 'true': 164 if elbow == 'true':
156 elbow = True 165 elbow = True
157 else: 166 else:
187 for i in range(len(all_labels)): 196 for i in range(len(all_labels)):
188 prefix = '' 197 prefix = ''
189 if (i + k_min == best): 198 if (i + k_min == best):
190 prefix = '_BEST' 199 prefix = '_BEST'
191 200
192 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_output/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') 201 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
193 202
194 if davies: 203 if davies:
195 with np.errstate(divide='ignore', invalid='ignore'): 204 with np.errstate(divide='ignore', invalid='ignore'):
196 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) 205 davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
197 warning("\nFor n_clusters = " + str(i + k_min) + 206 warning("\nFor n_clusters = " + str(i + k_min) +
198 " The average davies bouldin score is: " + str(davies_bouldin)) 207 " The average davies bouldin score is: " + str(davies_bouldin))
199 208
200 209
201 if silhouette: 210 if silhouette:
202 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/kmeans_output/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') 211 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
203 212
204 213
205 if elbow: 214 if elbow:
206 elbow_plot(distortions, k_min,k_max) 215 elbow_plot(distortions, k_min,k_max)
207 216
214 def elbow_plot (distortions, k_min, k_max): 223 def elbow_plot (distortions, k_min, k_max):
215 plt.figure(0) 224 plt.figure(0)
216 plt.plot(range(k_min, k_max+1), distortions, marker = 'o') 225 plt.plot(range(k_min, k_max+1), distortions, marker = 'o')
217 plt.xlabel('Number of cluster') 226 plt.xlabel('Number of cluster')
218 plt.ylabel('Distortion') 227 plt.ylabel('Distortion')
219 s = 'clustering/kmeans_output/elbow_plot.png' 228 s = 'clustering/elbow_plot.png'
220 fig = plt.gcf() 229 fig = plt.gcf()
221 fig.set_size_inches(18.5, 10.5, forward = True) 230 fig.set_size_inches(18.5, 10.5, forward = True)
222 fig.savefig(s, dpi=100) 231 fig.savefig(s, dpi=100)
223 232
224 233
286 plt.savefig(path, bbox_inches='tight') 295 plt.savefig(path, bbox_inches='tight')
287 296
288 ######################## dbscan ############################################## 297 ######################## dbscan ##############################################
289 298
290 def dbscan(dataset, eps, min_samples): 299 def dbscan(dataset, eps, min_samples):
291 if not os.path.exists('clustering/dbscan_output'): 300 if not os.path.exists('clustering'):
292 os.makedirs('clustering/dbscan_output') 301 os.makedirs('clustering')
293 302
294 if eps is not None: 303 if eps is not None:
295 clusterer = DBSCAN(eps = eps, min_samples = min_samples) 304 clusterer = DBSCAN(eps = eps, min_samples = min_samples)
296 else: 305 else:
297 clusterer = DBSCAN() 306 clusterer = DBSCAN()
303 labels = clustering.labels_ 312 labels = clustering.labels_
304 313
305 # Number of clusters in labels, ignoring noise if present. 314 # Number of clusters in labels, ignoring noise if present.
306 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 315 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
307 316
308 silhouette_avg = silhouette_score(dataset, labels)
309 warning("For n_clusters =" + str(n_clusters_) +
310 "The average silhouette_score is :" + str(silhouette_avg))
311 317
312 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL 318 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
313 319
314 # Black removed and is used for noise instead. 320
315 unique_labels = set(labels) 321 write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
316 colors = [plt.cm.Spectral(each)
317 for each in np.linspace(0, 1, len(unique_labels))]
318 for k, col in zip(unique_labels, colors):
319 if k == -1:
320 # Black used for noise.
321 col = [0, 0, 0, 1]
322
323 class_member_mask = (labels == k)
324
325 xy = dataset[class_member_mask & core_samples_mask]
326 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
327 markeredgecolor='k', markersize=14)
328
329 xy = dataset[class_member_mask & ~core_samples_mask]
330 plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
331 markeredgecolor='k', markersize=6)
332
333 plt.title('Estimated number of clusters: %d' % n_clusters_)
334 s = 'clustering/dbscan_output/dbscan_plot.png'
335 fig = plt.gcf()
336 fig.set_size_inches(18.5, 10.5, forward = True)
337 fig.savefig(s, dpi=100)
338
339
340 write_to_csv(dataset, labels, 'clustering/dbscan_output/dbscan_results.tsv')
341 322
342 ########################## hierachical ####################################### 323 ########################## hierachical #######################################
343 324
344 def hierachical_agglomerative(dataset, k_min, k_max): 325 def hierachical_agglomerative(dataset, k_min, k_max):
345 326
346 if not os.path.exists('clustering/agglomerative_output'): 327 if not os.path.exists('clustering'):
347 os.makedirs('clustering/agglomerative_output') 328 os.makedirs('clustering')
348 329
349 plt.figure(figsize=(10, 7)) 330 plt.figure(figsize=(10, 7))
350 plt.title("Customer Dendograms") 331 plt.title("Customer Dendograms")
351 shc.dendrogram(shc.linkage(dataset, method='ward')) 332 shc.dendrogram(shc.linkage(dataset, method='ward'))
352 fig = plt.gcf() 333 fig = plt.gcf()
353 fig.savefig('clustering/agglomerative_output/dendogram.png', dpi=200) 334 fig.savefig('clustering/dendogram.png', dpi=200)
354 335
355 range_n_clusters = [i for i in range(k_min, k_max+1)] 336 range_n_clusters = [i for i in range(k_min, k_max+1)]
356 337
357 for n_clusters in range_n_clusters: 338 for n_clusters in range_n_clusters:
358 339
359 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') 340 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
360 cluster.fit_predict(dataset) 341 cluster.fit_predict(dataset)
361 cluster_labels = cluster.labels_ 342 cluster_labels = cluster.labels_
362 343
363 silhouette_avg = silhouette_score(dataset, cluster_labels) 344 silhouette_avg = silhouette_score(dataset, cluster_labels)
364 warning("For n_clusters =", n_clusters, 345 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
365 "The average silhouette_score is :", silhouette_avg) 346 #warning("For n_clusters =", n_clusters,
366 347 #"The average silhouette_score is :", silhouette_avg)
367 plt.clf() 348
368 plt.figure(figsize=(10, 7))
369 plt.title("Agglomerative Hierarchical Clustering\nwith " + str(n_clusters) + " clusters and " + str(silhouette_avg) + " silhouette score")
370 plt.scatter(dataset[:,0], dataset[:,1], c = cluster_labels, cmap='rainbow')
371 s = 'clustering/agglomerative_output/hierachical_' + str(n_clusters) + '_clusters.png'
372 fig = plt.gcf()
373 fig.set_size_inches(10, 7, forward = True)
374 fig.savefig(s, dpi=200)
375
376 write_to_csv(dataset, cluster_labels, 'clustering/agglomerative_output/agglomerative_hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
377 349
378 350
379 351
380 352
381 ############################# main ########################################### 353 ############################# main ###########################################
396 368
397 for i in X.columns: 369 for i in X.columns:
398 tmp = X[i][0] 370 tmp = X[i][0]
399 if tmp == None: 371 if tmp == None:
400 X = X.drop(columns=[i]) 372 X = X.drop(columns=[i])
401
402 X = pd.DataFrame.to_numpy(X)
403 373
404 374
405 if args.cluster_type == 'kmeans': 375 if args.cluster_type == 'kmeans':
406 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies) 376 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)
407 377