1
|
1 #!/usr/bin/env python3
|
|
2 import pandas as pd
|
|
3 from scipy import stats
|
|
4 import matplotlib as mpl
|
|
5 mpl.use('Agg')
|
|
6 import matplotlib.pyplot as plt
|
|
7 from sklearn.decomposition import PCA
|
|
8 import numpy as np
|
|
9 from sklearn.datasets import make_blobs
|
|
10 from sklearn.cluster import MiniBatchKMeans
|
|
11 from yellowbrick.cluster.elbow import kelbow_visualizer
|
|
12 import sys
|
|
13 import os
|
|
14
|
|
def _load_features(paths):
    """Read every table in *paths* and return them stacked as one frame.

    Each file is parsed as whitespace-delimited text whose ``name`` column
    becomes the index.  Columns that are zero in every row are dropped
    from the combined frame.
    """
    # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
    frames = [
        pd.read_csv(path, sep=r"\s+", index_col=["name"]) for path in paths
    ]
    # One concat over all frames instead of growing a frame file by file.
    # NOTE(review): files are stacked row-wise (axis=0); a column missing
    # from one file becomes NaN for that file's rows -- confirm all inputs
    # share the same columns.
    features = pd.concat(frames, axis=0)
    return features.loc[:, (features != 0).any(axis=0)]


def _select_k(elbow, silhouette):
    """Return the smaller of the two proposed k values.

    A search that failed to locate an optimum reports ``None``; such a
    value is ignored.  Returns ``None`` when both searches failed.
    """
    candidates = [value for value in (elbow, silhouette) if value is not None]
    if not candidates:
        return None
    return int(min(candidates))


def main():
    """Cluster the input rows with mini-batch k-means and save the labels.

    ``sys.argv[1]`` is a comma-separated list of input data files
    (logss, DAC, both, or discrete versions).  The number of clusters is
    the smaller of the values proposed by the elbow search and the
    silhouette search.

    Files written to the current directory:
        elbow.png       -- plot of the elbow search
        silhouette.png  -- plot of the silhouette search
        kmeans.txt      -- space-separated ``name`` / ``clusters`` table

    Exits with an error message when no input file is given or when
    neither search proposes a k.
    """
    #####Input files and user specifications###########
    usage = "usage: {} DATASET[,DATASET...]".format(
        sys.argv[0] if sys.argv else "script"
    )
    if len(sys.argv) < 2:
        sys.exit(usage)
    # Ignore empty entries such as the one produced by a trailing comma.
    datasets = [path for path in sys.argv[1].split(',') if path]
    if not datasets:
        sys.exit(usage)

    features = _load_features(datasets)

    # Use the quick method and immediately show the figure
    elbow_viz = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        features,
        k=(2, 10),
        title="optimal k via elbow method",
    )
    elbow = elbow_viz.elbow_value_
    # Save the plot, then clear the figure before the next search draws.
    elbow_viz.show('elbow.png', clear_figure=True)

    silhouette_viz = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        features,
        k=(2, 10),
        metric='silhouette',
        title="optimal k via silhouette method",
    )
    silhouette = silhouette_viz.elbow_value_
    silhouette_viz.show("silhouette.png")

    k = _select_k(elbow, silhouette)
    if k is None:
        sys.exit(
            "error: neither the elbow nor the silhouette search "
            "found an optimal k"
        )

    #report out final k-means selected
    kmeans = MiniBatchKMeans(
        n_clusters=k, random_state=0, n_init=100
    ).fit(features)
    # Build the output table directly; the feature frame stays untouched.
    assignments = pd.DataFrame(
        {'clusters': kmeans.labels_}, index=features.index
    )
    assignments.index.name = 'name'
    assignments.to_csv('kmeans.txt', sep=' ')
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|