diff Marea/marea_cluster.py @ 33:abf0bfe01c78 draft

Uploaded
author bimib
date Wed, 16 Oct 2019 16:25:56 -0400
parents 944e15aa970a
children 1a97d1537623
line wrap: on
line diff
--- a/Marea/marea_cluster.py	Wed Oct 16 07:12:37 2019 -0400
+++ b/Marea/marea_cluster.py	Wed Oct 16 16:25:56 2019 -0400
@@ -72,11 +72,11 @@
                         help = 'your tool directory')
                         
     parser.add_argument('-ms', '--min_samples',
-                        type = int,
+                        type = float,
                         help = 'min samples for dbscan (optional)')
                         
     parser.add_argument('-ep', '--eps',
-                        type = int,
+                        type = float,
                         help = 'eps for dbscan (optional)')
                         
     parser.add_argument('-bc', '--best_cluster',
@@ -310,7 +310,7 @@
             
 ######################## dbscan ##############################################
     
-def dbscan(dataset, eps, min_samples):
+def dbscan(dataset, eps, min_samples, best_cluster):
     if not os.path.exists('clustering'):
         os.makedirs('clustering')
         
@@ -331,12 +331,15 @@
     
     ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL
     
-    
-    write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')
+    labels = labels
+    predict = [x+1 for x in labels]
+    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+  
     
 ########################## hierachical #######################################
     
-def hierachical_agglomerative(dataset, k_min, k_max):
+def hierachical_agglomerative(dataset, k_min, k_max, best_cluster):
 
     if not os.path.exists('clustering'):
         os.makedirs('clustering')
@@ -349,16 +352,28 @@
     
     range_n_clusters = [i for i in range(k_min, k_max+1)]
 
-    for n_clusters in range_n_clusters:
-        
+    scores = []
+    labels = []
+    for n_clusters in range_n_clusters:    
         cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')  
         cluster.fit_predict(dataset)  
         cluster_labels = cluster.labels_
-        
+        labels.append(cluster_labels)
         silhouette_avg = silhouette_score(dataset, cluster_labels)
         write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
+        scores.append(silhouette_avg)
         #warning("For n_clusters =", n_clusters,
               #"The average silhouette_score is :", silhouette_avg)
+              
+    best = max_index(scores) + k_min
+     
+    for i in range(len(labels)):
+        if (i + k_min == best):
+            labels = labels[i]
+            predict = [x+1 for x in labels]
+            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
+            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
+     
         
         
        
@@ -390,10 +405,10 @@
         kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies, args.best_cluster)
     
     if args.cluster_type == 'dbscan':
-        dbscan(X, args.eps, args.min_samples)
+        dbscan(X, args.eps, args.min_samples, args.best_cluster)
         
     if args.cluster_type == 'hierarchy':
-        hierachical_agglomerative(X, args.k_min, args.k_max)
+        hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster)
         
 ##############################################################################