comparison Marea/marea_cluster.py @ 34:1a97d1537623 draft

Lots of bug fixes
author bimib
date Sat, 26 Oct 2019 07:49:31 -0400
parents abf0bfe01c78
children 94c51690d40c
comparison
equal deleted inserted replaced
33:abf0bfe01c78 34:1a97d1537623
206 labels = all_labels[i] 206 labels = all_labels[i]
207 predict = [x+1 for x in labels] 207 predict = [x+1 for x in labels]
208 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) 208 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
209 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) 209 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
210 210
211 211
212 if davies:
213 with np.errstate(divide='ignore', invalid='ignore'):
214 davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
215 warning("\nFor n_clusters = " + str(i + k_min) +
216 " The average davies bouldin score is: " + str(davies_bouldin))
217 212
218 213
219 if silhouette: 214 if silhouette:
220 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') 215 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
221 216
327 322
328 # Number of clusters in labels, ignoring noise if present. 323 # Number of clusters in labels, ignoring noise if present.
329 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) 324 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
330 325
331 326
332 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
333
334 labels = labels 327 labels = labels
335 predict = [x+1 for x in labels] 328 predict = [x+1 for x in labels]
336 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) 329 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
337 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) 330 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
338 331
339 332
340 ########################## hierachical ####################################### 333 ########################## hierachical #######################################
341 334
342 def hierachical_agglomerative(dataset, k_min, k_max, best_cluster): 335 def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette):
343 336
344 if not os.path.exists('clustering'): 337 if not os.path.exists('clustering'):
345 os.makedirs('clustering') 338 os.makedirs('clustering')
346 339
347 plt.figure(figsize=(10, 7)) 340 plt.figure(figsize=(10, 7))
352 345
353 range_n_clusters = [i for i in range(k_min, k_max+1)] 346 range_n_clusters = [i for i in range(k_min, k_max+1)]
354 347
355 scores = [] 348 scores = []
356 labels = [] 349 labels = []
350
357 for n_clusters in range_n_clusters: 351 for n_clusters in range_n_clusters:
358 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') 352 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
359 cluster.fit_predict(dataset) 353 cluster.fit_predict(dataset)
360 cluster_labels = cluster.labels_ 354 cluster_labels = cluster.labels_
361 labels.append(cluster_labels) 355 labels.append(cluster_labels)
362 silhouette_avg = silhouette_score(dataset, cluster_labels)
363 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') 356 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
364 scores.append(silhouette_avg)
365 #warning("For n_clusters =", n_clusters,
366 #"The average silhouette_score is :", silhouette_avg)
367 357
368 best = max_index(scores) + k_min 358 best = max_index(scores) + k_min
359
360 for i in range(len(labels)):
361 prefix = ''
362 if (i + k_min == best):
363 prefix = '_BEST'
364 if silhouette == 'true':
365 silihouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
369 366
370 for i in range(len(labels)): 367 for i in range(len(labels)):
371 if (i + k_min == best): 368 if (i + k_min == best):
372 labels = labels[i] 369 labels = labels[i]
373 predict = [x+1 for x in labels] 370 predict = [x+1 for x in labels]
374 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) 371 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
375 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) 372 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
376 373
377
378
379
380
381 374
382 ############################# main ########################################### 375 ############################# main ###########################################
383 376
384 377
385 def main(): 378 def main():
406 399
407 if args.cluster_type == 'dbscan': 400 if args.cluster_type == 'dbscan':
408 dbscan(X, args.eps, args.min_samples, args.best_cluster) 401 dbscan(X, args.eps, args.min_samples, args.best_cluster)
409 402
410 if args.cluster_type == 'hierarchy': 403 if args.cluster_type == 'hierarchy':
411 hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster) 404 hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
412 405
413 ############################################################################## 406 ##############################################################################
414 407
415 if __name__ == "__main__": 408 if __name__ == "__main__":
416 main() 409 main()