comparison marea_cluster.py @ 38:4e1b466935cd draft

Uploaded
author bimib
date Mon, 25 Nov 2019 12:02:28 -0500
parents
children
comparison
equal deleted inserted replaced
37:2495c7772ca8 38:4e1b466935cd
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Jun 3 19:51:00 2019
4 @author: Narger
5 """
6
7 import sys
8 import argparse
9 import os
10 from sklearn.datasets import make_blobs
11 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
12 from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster
13 import matplotlib
14 matplotlib.use('agg')
15 import matplotlib.pyplot as plt
16 import scipy.cluster.hierarchy as shc
17 import matplotlib.cm as cm
18 import numpy as np
19 import pandas as pd
20
21 ################################# process args ###############################
22
23 def process_args(args):
24 parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
25 description = 'process some value\'s' +
26 ' genes to create class.')
27
28 parser.add_argument('-ol', '--out_log',
29 help = "Output log")
30
31 parser.add_argument('-in', '--input',
32 type = str,
33 help = 'input dataset')
34
35 parser.add_argument('-cy', '--cluster_type',
36 type = str,
37 choices = ['kmeans', 'dbscan', 'hierarchy'],
38 default = 'kmeans',
39 help = 'choose clustering algorythm')
40
41 parser.add_argument('-k1', '--k_min',
42 type = int,
43 default = 2,
44 help = 'choose minimun cluster number to be generated')
45
46 parser.add_argument('-k2', '--k_max',
47 type = int,
48 default = 7,
49 help = 'choose maximum cluster number to be generated')
50
51 parser.add_argument('-el', '--elbow',
52 type = str,
53 default = 'false',
54 choices = ['true', 'false'],
55 help = 'choose if you want to generate an elbow plot for kmeans')
56
57 parser.add_argument('-si', '--silhouette',
58 type = str,
59 default = 'false',
60 choices = ['true', 'false'],
61 help = 'choose if you want silhouette plots')
62
63 parser.add_argument('-td', '--tool_dir',
64 type = str,
65 required = True,
66 help = 'your tool directory')
67
68 parser.add_argument('-ms', '--min_samples',
69 type = float,
70 help = 'min samples for dbscan (optional)')
71
72 parser.add_argument('-ep', '--eps',
73 type = float,
74 help = 'eps for dbscan (optional)')
75
76 parser.add_argument('-bc', '--best_cluster',
77 type = str,
78 help = 'output of best cluster tsv')
79
80
81
82 args = parser.parse_args()
83 return args
84
85 ########################### warning ###########################################
86
87 def warning(s):
88 args = process_args(sys.argv)
89 with open(args.out_log, 'a') as log:
90 log.write(s + "\n\n")
91 print(s)
92
93 ########################## read dataset ######################################
94
95 def read_dataset(dataset):
96 try:
97 dataset = pd.read_csv(dataset, sep = '\t', header = 0)
98 except pd.errors.EmptyDataError:
99 sys.exit('Execution aborted: wrong format of dataset\n')
100 if len(dataset.columns) < 2:
101 sys.exit('Execution aborted: wrong format of dataset\n')
102 return dataset
103
104 ############################ rewrite_input ###################################
105
106 def rewrite_input(dataset):
107 #Riscrivo il dataset come dizionario di liste,
108 #non come dizionario di dizionari
109
110 dataset.pop('Reactions', None)
111
112 for key, val in dataset.items():
113 l = []
114 for i in val:
115 if i == 'None':
116 l.append(None)
117 else:
118 l.append(float(i))
119
120 dataset[key] = l
121
122 return dataset
123
124 ############################## write to csv ##################################
125
126 def write_to_csv (dataset, labels, name):
127 #labels = predict
128 predict = [x+1 for x in labels]
129
130 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
131
132 dest = name
133 classe.to_csv(dest, sep = '\t', index = False,
134 header = ['Patient_ID', 'Class'])
135
136 ########################### trova il massimo in lista ########################
137 def max_index (lista):
138 best = -1
139 best_index = 0
140 for i in range(len(lista)):
141 if lista[i] > best:
142 best = lista [i]
143 best_index = i
144
145 return best_index
146
147 ################################ kmeans #####################################
148
149 def kmeans (k_min, k_max, dataset, elbow, silhouette, best_cluster):
150 if not os.path.exists('clustering'):
151 os.makedirs('clustering')
152
153
154 if elbow == 'true':
155 elbow = True
156 else:
157 elbow = False
158
159 if silhouette == 'true':
160 silhouette = True
161 else:
162 silhouette = False
163
164 range_n_clusters = [i for i in range(k_min, k_max+1)]
165 distortions = []
166 scores = []
167 all_labels = []
168
169 clusterer = KMeans(n_clusters=1, random_state=10)
170 distortions.append(clusterer.fit(dataset).inertia_)
171
172
173 for n_clusters in range_n_clusters:
174 clusterer = KMeans(n_clusters=n_clusters, random_state=10)
175 cluster_labels = clusterer.fit_predict(dataset)
176
177 all_labels.append(cluster_labels)
178 if n_clusters == 1:
179 silhouette_avg = 0
180 else:
181 silhouette_avg = silhouette_score(dataset, cluster_labels)
182 scores.append(silhouette_avg)
183 distortions.append(clusterer.fit(dataset).inertia_)
184
185 best = max_index(scores) + k_min
186
187 for i in range(len(all_labels)):
188 prefix = ''
189 if (i + k_min == best):
190 prefix = '_BEST'
191
192 write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
193
194
195 if (prefix == '_BEST'):
196 labels = all_labels[i]
197 predict = [x+1 for x in labels]
198 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
199 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
200
201
202
203
204 if silhouette:
205 silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
206
207
208 if elbow:
209 elbow_plot(distortions, k_min,k_max)
210
211
212
213
214
215 ############################## elbow_plot ####################################
216
217 def elbow_plot (distortions, k_min, k_max):
218 plt.figure(0)
219 x = list(range(k_min, k_max + 1))
220 x.insert(0, 1)
221 plt.plot(x, distortions, marker = 'o')
222 plt.xlabel('Number of clusters (k)')
223 plt.ylabel('Distortion')
224 s = 'clustering/elbow_plot.png'
225 fig = plt.gcf()
226 fig.set_size_inches(18.5, 10.5, forward = True)
227 fig.savefig(s, dpi=100)
228
229
230 ############################## silhouette plot ###############################
231 def silihouette_draw(dataset, labels, n_clusters, path):
232 if n_clusters == 1:
233 return None
234
235 silhouette_avg = silhouette_score(dataset, labels)
236 warning("For n_clusters = " + str(n_clusters) +
237 " The average silhouette_score is: " + str(silhouette_avg))
238
239 plt.close('all')
240 # Create a subplot with 1 row and 2 columns
241 fig, (ax1) = plt.subplots(1, 1)
242
243 fig.set_size_inches(18, 7)
244
245 # The 1st subplot is the silhouette plot
246 # The silhouette coefficient can range from -1, 1 but in this example all
247 # lie within [-0.1, 1]
248 ax1.set_xlim([-1, 1])
249 # The (n_clusters+1)*10 is for inserting blank space between silhouette
250 # plots of individual clusters, to demarcate them clearly.
251 ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
252
253 # Compute the silhouette scores for each sample
254 sample_silhouette_values = silhouette_samples(dataset, labels)
255
256 y_lower = 10
257 for i in range(n_clusters):
258 # Aggregate the silhouette scores for samples belonging to
259 # cluster i, and sort them
260 ith_cluster_silhouette_values = \
261 sample_silhouette_values[labels == i]
262
263 ith_cluster_silhouette_values.sort()
264
265 size_cluster_i = ith_cluster_silhouette_values.shape[0]
266 y_upper = y_lower + size_cluster_i
267
268 color = cm.nipy_spectral(float(i) / n_clusters)
269 ax1.fill_betweenx(np.arange(y_lower, y_upper),
270 0, ith_cluster_silhouette_values,
271 facecolor=color, edgecolor=color, alpha=0.7)
272
273 # Label the silhouette plots with their cluster numbers at the middle
274 ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
275
276 # Compute the new y_lower for next plot
277 y_lower = y_upper + 10 # 10 for the 0 samples
278
279 ax1.set_title("The silhouette plot for the various clusters.")
280 ax1.set_xlabel("The silhouette coefficient values")
281 ax1.set_ylabel("Cluster label")
282
283 # The vertical line for average silhouette score of all the values
284 ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
285
286 ax1.set_yticks([]) # Clear the yaxis labels / ticks
287 ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
288
289
290 plt.suptitle(("Silhouette analysis for clustering on sample data "
291 "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
292
293
294 plt.savefig(path, bbox_inches='tight')
295
296 ######################## dbscan ##############################################
297
298 def dbscan(dataset, eps, min_samples, best_cluster):
299 if not os.path.exists('clustering'):
300 os.makedirs('clustering')
301
302 if eps is not None:
303 clusterer = DBSCAN(eps = eps, min_samples = min_samples)
304 else:
305 clusterer = DBSCAN()
306
307 clustering = clusterer.fit(dataset)
308
309 core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
310 core_samples_mask[clustering.core_sample_indices_] = True
311 labels = clustering.labels_
312
313 # Number of clusters in labels, ignoring noise if present.
314 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
315
316
317 labels = labels
318 predict = [x+1 for x in labels]
319 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
320 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
321
322
323 ########################## hierachical #######################################
324
325 def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette):
326
327 if not os.path.exists('clustering'):
328 os.makedirs('clustering')
329
330 plt.figure(figsize=(10, 7))
331 plt.title("Classes Dendogram")
332 shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())
333 fig = plt.gcf()
334 fig.savefig('clustering/dendogram.png', dpi=200)
335
336 range_n_clusters = [i for i in range(k_min, k_max+1)]
337
338 scores = []
339 labels = []
340
341 for n_clusters in range_n_clusters:
342 cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
343 cluster.fit_predict(dataset)
344 cluster_labels = cluster.labels_
345 labels.append(cluster_labels)
346 write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
347
348 best = max_index(scores) + k_min
349
350 for i in range(len(labels)):
351 prefix = ''
352 if (i + k_min == best):
353 prefix = '_BEST'
354 if silhouette == 'true':
355 silihouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
356
357 for i in range(len(labels)):
358 if (i + k_min == best):
359 labels = labels[i]
360 predict = [x+1 for x in labels]
361 classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
362 classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
363
364
365 ############################# main ###########################################
366
367
368 def main():
369 if not os.path.exists('clustering'):
370 os.makedirs('clustering')
371
372 args = process_args(sys.argv)
373
374 #Data read
375
376 X = read_dataset(args.input)
377 X = pd.DataFrame.to_dict(X, orient='list')
378 X = rewrite_input(X)
379 X = pd.DataFrame.from_dict(X, orient = 'index')
380
381 for i in X.columns:
382 tmp = X[i][0]
383 if tmp == None:
384 X = X.drop(columns=[i])
385
386
387 if args.cluster_type == 'kmeans':
388 kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
389
390 if args.cluster_type == 'dbscan':
391 dbscan(X, args.eps, args.min_samples, args.best_cluster)
392
393 if args.cluster_type == 'hierarchy':
394 hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
395
396 ##############################################################################
397
398 if __name__ == "__main__":
399 main()