Mercurial > repos > bimib > marea
comparison Marea/marea_cluster.py @ 34:1a97d1537623 draft
Lot of bug fixes
author | bimib |
---|---|
date | Sat, 26 Oct 2019 07:49:31 -0400 |
parents | abf0bfe01c78 |
children | 94c51690d40c |
comparison
equal
deleted
inserted
replaced
33:abf0bfe01c78 | 34:1a97d1537623 |
---|---|
206 labels = all_labels[i] | 206 labels = all_labels[i] |
207 predict = [x+1 for x in labels] | 207 predict = [x+1 for x in labels] |
208 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) | 208 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) |
209 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) | 209 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) |
210 | 210 |
211 | 211 |
212 if davies: | |
213 with np.errstate(divide='ignore', invalid='ignore'): | |
214 davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) | |
215 warning("\nFor n_clusters = " + str(i + k_min) + | |
216 " The average davies bouldin score is: " + str(davies_bouldin)) | |
217 | 212 |
218 | 213 |
219 if silhouette: | 214 if silhouette: |
220 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') | 215 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') |
221 | 216 |
327 | 322 |
328 # Number of clusters in labels, ignoring noise if present. | 323 # Number of clusters in labels, ignoring noise if present. |
329 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) | 324 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) |
330 | 325 |
331 | 326 |
332 ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL | |
333 | |
334 labels = labels | 327 labels = labels |
335 predict = [x+1 for x in labels] | 328 predict = [x+1 for x in labels] |
336 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) | 329 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) |
337 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) | 330 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) |
338 | 331 |
339 | 332 |
340 ########################## hierachical ####################################### | 333 ########################## hierachical ####################################### |
341 | 334 |
342 def hierachical_agglomerative(dataset, k_min, k_max, best_cluster): | 335 def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette): |
343 | 336 |
344 if not os.path.exists('clustering'): | 337 if not os.path.exists('clustering'): |
345 os.makedirs('clustering') | 338 os.makedirs('clustering') |
346 | 339 |
347 plt.figure(figsize=(10, 7)) | 340 plt.figure(figsize=(10, 7)) |
352 | 345 |
353 range_n_clusters = [i for i in range(k_min, k_max+1)] | 346 range_n_clusters = [i for i in range(k_min, k_max+1)] |
354 | 347 |
355 scores = [] | 348 scores = [] |
356 labels = [] | 349 labels = [] |
350 | |
357 for n_clusters in range_n_clusters: | 351 for n_clusters in range_n_clusters: |
358 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') | 352 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') |
359 cluster.fit_predict(dataset) | 353 cluster.fit_predict(dataset) |
360 cluster_labels = cluster.labels_ | 354 cluster_labels = cluster.labels_ |
361 labels.append(cluster_labels) | 355 labels.append(cluster_labels) |
362 silhouette_avg = silhouette_score(dataset, cluster_labels) | |
363 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') | 356 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') |
364 scores.append(silhouette_avg) | |
365 #warning("For n_clusters =", n_clusters, | |
366 #"The average silhouette_score is :", silhouette_avg) | |
367 | 357 |
368 best = max_index(scores) + k_min | 358 best = max_index(scores) + k_min |
359 | |
360 for i in range(len(labels)): | |
361 prefix = '' | |
362 if (i + k_min == best): | |
363 prefix = '_BEST' | |
364 if silhouette == 'true': | |
365 silihouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') | |
369 | 366 |
370 for i in range(len(labels)): | 367 for i in range(len(labels)): |
371 if (i + k_min == best): | 368 if (i + k_min == best): |
372 labels = labels[i] | 369 labels = labels[i] |
373 predict = [x+1 for x in labels] | 370 predict = [x+1 for x in labels] |
374 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) | 371 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) |
375 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) | 372 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) |
376 | 373 |
377 | |
378 | |
379 | |
380 | |
381 | 374 |
382 ############################# main ########################################### | 375 ############################# main ########################################### |
383 | 376 |
384 | 377 |
385 def main(): | 378 def main(): |
406 | 399 |
407 if args.cluster_type == 'dbscan': | 400 if args.cluster_type == 'dbscan': |
408 dbscan(X, args.eps, args.min_samples, args.best_cluster) | 401 dbscan(X, args.eps, args.min_samples, args.best_cluster) |
409 | 402 |
410 if args.cluster_type == 'hierarchy': | 403 if args.cluster_type == 'hierarchy': |
411 hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster) | 404 hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette) |
412 | 405 |
413 ############################################################################## | 406 ############################################################################## |
414 | 407 |
415 if __name__ == "__main__": | 408 if __name__ == "__main__": |
416 main() | 409 main() |