# -*- coding: utf-8 -*-
"""
Created on Mon Jun 3 19:51:00 2019
@author: Narger
"""

import sys
import argparse
import os
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import matplotlib.cm as cm
import numpy as np
import pandas as pd

################################# process args ###############################

def process_args(args):
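    """
    Build and parse the command-line arguments of the tool.
    Note: the args parameter is ignored; argparse reads sys.argv directly.
    """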
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'cluster the samples of the input dataset into classes.')

    parser.add_argument('-ol', '--out_log',
                        help = "Output log")

    parser.add_argument('-in', '--input',
                        type = str,
                        help = 'input dataset')

    parser.add_argument('-cy', '--cluster_type',
                        type = str,
                        choices = ['kmeans', 'meanshift', 'dbscan', 'hierarchy'],
                        default = 'kmeans',
                        help = 'choose the clustering algorithm')

    parser.add_argument('-k1', '--k_min',
                        type = int,
                        default = 2,
                        help = 'choose the minimum number of clusters to be generated')

    parser.add_argument('-k2', '--k_max',
                        type = int,
                        default = 7,
                        help = 'choose the maximum number of clusters to be generated')

    parser.add_argument('-el', '--elbow',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want to generate an elbow plot for kmeans')

    parser.add_argument('-si', '--silhouette',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want silhouette plots')

    parser.add_argument('-db', '--davies',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'choose if you want davies bouldin scores')

    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')

    parser.add_argument('-ms', '--min_samples',
                        type = int,
                        help = 'min samples for dbscan (optional)')

    parser.add_argument('-ep', '--eps',
                        type = float,
                        help = 'eps for dbscan (optional)')

    args = parser.parse_args()
    return args

########################### warning ###########################################

def warning(s):
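    """Append the message s to the log file given by --out_log and echo it to stdout."""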
    args = process_args(sys.argv)
    with open(args.out_log, 'a') as log:
        log.write(s + "\n\n")
    print(s)

########################## read dataset ######################################

def read_dataset(dataset):
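    """Read the tab-separated input dataset and abort if it is empty or has fewer than two columns."""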
    try:
        dataset = pd.read_csv(dataset, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        sys.exit('Execution aborted: wrong format of dataset\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong format of dataset\n')
    return dataset

############################ rewrite_input ###################################

def rewrite_input(dataset):
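    """Convert each value in the dataset dict to float, turning the string 'None' into None."""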
    # Rewrite the dataset as a dictionary of lists,
    # not as a dictionary of dictionaries

    for key, val in dataset.items():
        l = []
        for i in val:
            if i == 'None':
                l.append(None)
            else:
                l.append(float(i))

        dataset[key] = l

    return dataset

############################## write to csv ##################################

def write_to_csv(dataset, labels, name):
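    """Write a two-column tab-separated file (Patient_ID, Class) mapping each sample to its 1-based cluster label."""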
    predict = [x + 1 for x in labels]

    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)

    dest = name
    classe.to_csv(dest, sep = '\t', index = False,
                  header = ['Patient_ID', 'Class'])
########################### find the maximum in a list #######################
def max_index(lista):
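    """Return the index of the largest value in lista."""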
    best = -1
    best_index = 0
    for i in range(len(lista)):
        if lista[i] > best:
            best = lista[i]
            best_index = i

    return best_index

################################ kmeans #####################################

def kmeans(k_min, k_max, dataset, elbow, silhouette, davies):
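    """
    Run KMeans for every k in [k_min, k_max], write each run's cluster
    assignments to clustering/, mark the k with the best average silhouette
    score as _BEST, and optionally produce an elbow plot, silhouette plots
    and Davies-Bouldin scores.
    """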
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    # The flags arrive as the strings 'true'/'false'; turn them into booleans
    elbow = (elbow == 'true')
    silhouette = (silhouette == 'true')
    davies = (davies == 'true')

    range_n_clusters = [i for i in range(k_min, k_max + 1)]
    distortions = []
    scores = []
    all_labels = []

    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)

        all_labels.append(cluster_labels)
        silhouette_avg = silhouette_score(dataset, cluster_labels)
        scores.append(silhouette_avg)
        distortions.append(clusterer.inertia_)  # fit_predict has already fitted the model

    best = max_index(scores) + k_min

    for i in range(len(all_labels)):
        prefix = ''
        if (i + k_min == best):
            prefix = '_BEST'

        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')

        if davies:
            with np.errstate(divide='ignore', invalid='ignore'):
                davies_bouldin = davies_bouldin_score(dataset, all_labels[i])
            warning("\nFor n_clusters = " + str(i + k_min) +
                    " The average davies bouldin score is: " + str(davies_bouldin))

        if silhouette:
            silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')

    if elbow:
        elbow_plot(distortions, k_min, k_max)

############################## elbow_plot ####################################

def elbow_plot(distortions, k_min, k_max):
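    """Plot the KMeans distortion (inertia) for every tested k and save it as clustering/elbow_plot.png."""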
    plt.figure(0)
    plt.plot(range(k_min, k_max + 1), distortions, marker = 'o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    s = 'clustering/elbow_plot.png'
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5, forward = True)
    fig.savefig(s, dpi=100)

############################## silhouette plot ###############################
def silihouette_draw(dataset, labels, n_clusters, path):
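    """Draw the silhouette plot of one clustering, log the average score via warning(), and save the figure to path."""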
    silhouette_avg = silhouette_score(dataset, labels)
    warning("For n_clusters = " + str(n_clusters) +
            " The average silhouette_score is: " + str(silhouette_avg))

    plt.close('all')
    # Create a figure with a single silhouette subplot
    fig, (ax1) = plt.subplots(1, 1)

    fig.set_size_inches(18, 7)

    # The silhouette coefficient ranges from -1 to 1
    ax1.set_xlim([-1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between the
    # silhouette plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(dataset, labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    # Titles, axis labels and the average-silhouette reference line only need
    # to be drawn once, so they sit outside the per-cluster loop.
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for clustering on sample data "
                  "with n_clusters = " + str(n_clusters) +
                  "\nAverage silhouette_score = " + str(silhouette_avg)),
                 fontsize=12, fontweight='bold')

    plt.savefig(path, bbox_inches='tight')

######################## dbscan ##############################################

def dbscan(dataset, eps, min_samples):
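    """Run DBSCAN on the dataset, using the provided eps/min_samples when given, and write the labels to clustering/dbscan_results.tsv."""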
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    # Only pass the parameters that were actually provided, so DBSCAN's
    # defaults are kept for the missing ones.
    params = {}
    if eps is not None:
        params['eps'] = eps
    if min_samples is not None:
        params['min_samples'] = min_samples
    clusterer = DBSCAN(**params)

    clustering = clusterer.fit(dataset)

    core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
    core_samples_mask[clustering.core_sample_indices_] = True
    labels = clustering.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    ## TODO: plots for DBSCAN (no centers) and hierarchical clustering

    write_to_csv(dataset, labels, 'clustering/dbscan_results.tsv')

########################## hierarchical ######################################

def hierachical_agglomerative(dataset, k_min, k_max):
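    """
    Draw a Ward-linkage dendrogram of the dataset and, for every k in
    [k_min, k_max], run agglomerative clustering and write the resulting
    labels to clustering/.
    """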
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    plt.figure(figsize=(10, 7))
    plt.title("Dendrogram")
    shc.dendrogram(shc.linkage(dataset, method='ward'))
    fig = plt.gcf()
    fig.savefig('clustering/dendogram.png', dpi=200)

    range_n_clusters = [i for i in range(k_min, k_max + 1)]

    for n_clusters in range_n_clusters:

        cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
        cluster.fit_predict(dataset)
        cluster_labels = cluster.labels_

        silhouette_avg = silhouette_score(dataset, cluster_labels)
        write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
        warning("For n_clusters = " + str(n_clusters) +
                " The average silhouette_score is: " + str(silhouette_avg))

############################# main ###########################################

def main():
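    """Read the input dataset, clean it, and run the clustering algorithm selected on the command line."""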
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    args = process_args(sys.argv)

    # Data read

    X = read_dataset(args.input)
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    X = pd.DataFrame.from_dict(X, orient = 'index')

    # Drop the columns whose first value is missing
    for i in X.columns:
        tmp = X[i][0]
        if tmp is None:
            X = X.drop(columns=[i])

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.davies)

    if args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples)

    if args.cluster_type == 'hierarchy':
        hierachical_agglomerative(X, args.k_min, args.k_max)

##############################################################################

if __name__ == "__main__":
    main()
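
# Example invocation (illustrative only: the script file name and the flag
# values are assumptions; the flags themselves match the parser above):
#   python clustering.py --tool_dir . --out_log clustering.log \
#       --input dataset.tsv --cluster_type kmeans --k_min 2 --k_max 7 \
#       --elbow true --silhouette true --davies false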