# -*- coding: utf-8 -*-
"""
Created on Mon Jun 3 19:51:00 2019
@author: Narger
"""

import sys
import argparse
import os
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib
matplotlib.use('agg')  # non-interactive backend: figures are only written to file
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
import matplotlib.cm as cm
import numpy as np
import pandas as pd
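# This script clusters the samples (rows) of a tab-separated dataset with k-means,
# DBSCAN or hierarchical (Ward) agglomerative clustering, writes each labelling to
# TSV files under ./clustering and optionally produces elbow and silhouette plots.
# Hypothetical example invocation (the script file name is a placeholder):
#   python clustering.py -td /path/to/tool_dir -in dataset.tsv -cy kmeans \
#       -k1 2 -k2 7 -el true -si true -ol run.log -bc best_cluster.tsv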

################################# process args ###############################

def process_args(args):
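    """Parse and return the command-line arguments of the tool."""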
    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
                                     description = 'cluster the samples of a dataset of gene/reaction values.')

    parser.add_argument('-ol', '--out_log',
                        help = 'output log file')

    parser.add_argument('-in', '--input',
                        type = str,
                        help = 'input dataset')

    parser.add_argument('-cy', '--cluster_type',
                        type = str,
                        choices = ['kmeans', 'dbscan', 'hierarchy'],
                        default = 'kmeans',
                        help = 'clustering algorithm to use')

    parser.add_argument('-k1', '--k_min',
                        type = int,
                        default = 2,
                        help = 'minimum number of clusters to be generated')

    parser.add_argument('-k2', '--k_max',
                        type = int,
                        default = 7,
                        help = 'maximum number of clusters to be generated')

    parser.add_argument('-el', '--elbow',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'whether to generate an elbow plot for k-means')

    parser.add_argument('-si', '--silhouette',
                        type = str,
                        default = 'false',
                        choices = ['true', 'false'],
                        help = 'whether to generate silhouette plots')

    parser.add_argument('-td', '--tool_dir',
                        type = str,
                        required = True,
                        help = 'your tool directory')

    parser.add_argument('-ms', '--min_samples',
                        type = float,
                        help = 'min_samples for DBSCAN (optional)')

    parser.add_argument('-ep', '--eps',
                        type = float,
                        help = 'eps for DBSCAN (optional)')

    parser.add_argument('-bc', '--best_cluster',
                        type = str,
                        help = 'output TSV of the best clustering')

    args = parser.parse_args()
    return args

########################### warning ##########################################

def warning(s):
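    """Append the message s to the log file passed via --out_log and echo it to stdout."""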
    args = process_args(sys.argv)
    with open(args.out_log, 'a') as log:
        log.write(s + "\n\n")
    print(s)

########################## read dataset ######################################

def read_dataset(dataset):
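    """Load a tab-separated dataset into a DataFrame, aborting if the file is empty
    or has fewer than two columns."""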
    try:
        dataset = pd.read_csv(dataset, sep = '\t', header = 0)
    except pd.errors.EmptyDataError:
        sys.exit('Execution aborted: wrong format of dataset\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong format of dataset\n')
    return dataset

############################ rewrite_input ###################################

def rewrite_input(dataset):
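    """Drop the 'Reactions' column and convert every remaining column of the dataset
    (a dict of lists) to floats, mapping the string 'None' to None."""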
    # Rewrite the dataset as a dictionary of lists
    # rather than a dictionary of dictionaries

    dataset.pop('Reactions', None)

    for key, val in dataset.items():
        l = []
        for i in val:
            if i == 'None':
                l.append(None)
            else:
                l.append(float(i))

        dataset[key] = l

    return dataset

############################## write to csv ##################################

def write_to_csv(dataset, labels, name):
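    """Write a TSV file mapping each Patient_ID to its cluster (classes numbered from 1)."""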
    # shift labels so that classes are numbered from 1 instead of 0
    predict = [x + 1 for x in labels]

    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)

    classe.to_csv(name, sep = '\t', index = False,
                  header = ['Patient_ID', 'Class'])

########################### find the maximum in a list #######################
def max_index(lista):
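    """Return the index of the largest value in lista (0 if the list is empty)."""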
    best = -1
    best_index = 0
    for i in range(len(lista)):
        if lista[i] > best:
            best = lista[i]
            best_index = i

    return best_index

################################ kmeans #####################################

def kmeans(k_min, k_max, dataset, elbow, silhouette, best_cluster):
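    """Run k-means for every k in [k_min, k_max], write each labelling to a TSV under
    ./clustering, export the clustering with the best average silhouette to best_cluster
    and optionally draw elbow and silhouette plots."""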
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    # the flags arrive from the command line as the strings 'true'/'false'
    elbow = (elbow == 'true')
    silhouette = (silhouette == 'true')

    range_n_clusters = [i for i in range(k_min, k_max + 1)]
    distortions = []
    scores = []
    all_labels = []

    # fit k = 1 only to provide the first point of the elbow plot
    clusterer = KMeans(n_clusters=1, random_state=10)
    distortions.append(clusterer.fit(dataset).inertia_)

    for n_clusters in range_n_clusters:
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)

        all_labels.append(cluster_labels)
        if n_clusters == 1:
            silhouette_avg = 0
        else:
            silhouette_avg = silhouette_score(dataset, cluster_labels)
        scores.append(silhouette_avg)
        # fit_predict has already fitted the estimator, so its inertia can be reused
        distortions.append(clusterer.inertia_)

    best = max_index(scores) + k_min

    for i in range(len(all_labels)):
        prefix = ''
        if (i + k_min == best):
            prefix = '_BEST'

        write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')

        if (prefix == '_BEST'):
            # also export the best labelling to the path requested on the command line
            labels = all_labels[i]
            predict = [x + 1 for x in labels]
            classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
            classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])

        if silhouette:
            silhouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')

    if elbow:
        elbow_plot(distortions, k_min, k_max)

############################## elbow_plot ####################################

def elbow_plot(distortions, k_min, k_max):
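    """Plot the k-means inertia against the number of clusters (including the k = 1
    baseline) and save the figure to clustering/elbow_plot.png."""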
    plt.figure(0)
    x = list(range(k_min, k_max + 1))
    x.insert(0, 1)  # the first distortion was computed for k = 1
    plt.plot(x, distortions, marker = 'o')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('Distortion')
    s = 'clustering/elbow_plot.png'
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5, forward = True)
    fig.savefig(s, dpi=100)

############################## silhouette plot ###############################
def silhouette_draw(dataset, labels, n_clusters, path):
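    """Draw the per-sample silhouette plot of a labelling with n_clusters clusters and
    save it to path; no plot is produced when n_clusters == 1."""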
    if n_clusters == 1:
        return None

    silhouette_avg = silhouette_score(dataset, labels)
    warning("For n_clusters = " + str(n_clusters) +
            " The average silhouette_score is: " + str(silhouette_avg))

    plt.close('all')
    # Create a figure with a single subplot for the silhouette plot
    fig, ax1 = plt.subplots(1, 1)

    fig.set_size_inches(18, 7)

    # The silhouette coefficient ranges from -1 to 1
    ax1.set_xlim([-1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(dataset, labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    # Decorate and save the figure once, after all clusters have been drawn
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    plt.suptitle(("Silhouette analysis for clustering on sample data "
                  "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')

    plt.savefig(path, bbox_inches='tight')

######################## dbscan ##############################################

def dbscan(dataset, eps, min_samples, best_cluster):
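    """Cluster the dataset with DBSCAN, using eps and min_samples when supplied and
    scikit-learn's defaults otherwise, and write the labelling to best_cluster."""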
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    # use the user-supplied parameters when given, scikit-learn's defaults otherwise
    dbscan_params = {}
    if eps is not None:
        dbscan_params['eps'] = eps
    if min_samples is not None:
        # min_samples is read as a float from the command line but DBSCAN expects an int
        dbscan_params['min_samples'] = int(min_samples)

    clusterer = DBSCAN(**dbscan_params)
    clustering = clusterer.fit(dataset)

    core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
    core_samples_mask[clustering.core_sample_indices_] = True
    labels = clustering.labels_

    # Number of clusters in labels, ignoring noise (label -1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # noise points (label -1) end up in class 0
    predict = [x + 1 for x in labels]
    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])

########################## hierarchical ######################################

def hierarchical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette):
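    """Save a Ward dendrogram of the dataset, run agglomerative clustering for every
    k in [k_min, k_max], write each labelling to a TSV under ./clustering and export
    the clustering with the best average silhouette to best_cluster."""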

    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    plt.figure(figsize=(10, 7))
    plt.title("Dendrogram")
    shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
    fig = plt.gcf()
    fig.savefig('clustering/dendogram.png', dpi=200)

    range_n_clusters = [i for i in range(k_min, k_max + 1)]

    scores = []
    labels = []

    for n_clusters in range_n_clusters:
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster.fit_predict(dataset)
        cluster_labels = cluster.labels_
        labels.append(cluster_labels)
        # keep the average silhouette of each k so that the best one can be selected below
        scores.append(silhouette_score(dataset, cluster_labels))
        write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')

    best = max_index(scores) + k_min

    for i in range(len(labels)):
        prefix = ''
        if (i + k_min == best):
            prefix = '_BEST'
        if silhouette == 'true':
            silhouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')

    # export the labelling with the highest average silhouette
    best_labels = labels[best - k_min]
    predict = [x + 1 for x in best_labels]
    classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
    classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])


############################# main ###########################################

def main():
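    """Read and preprocess the input dataset, then run the clustering algorithm
    selected on the command line."""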
    if not os.path.exists('clustering'):
        os.makedirs('clustering')

    args = process_args(sys.argv)

    # Read and preprocess the data
    X = read_dataset(args.input)
    X = pd.DataFrame.to_dict(X, orient='list')
    X = rewrite_input(X)
    X = pd.DataFrame.from_dict(X, orient = 'index')

    # drop any column whose first value is None
    for i in X.columns:
        tmp = X[i].iloc[0]
        if tmp is None:
            X = X.drop(columns=[i])

    if args.cluster_type == 'kmeans':
        kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)

    if args.cluster_type == 'dbscan':
        dbscan(X, args.eps, args.min_samples, args.best_cluster)

    if args.cluster_type == 'hierarchy':
        hierarchical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)

##############################################################################

if __name__ == "__main__":
    main()