comparison tools/myTools/bin/kmeans_full.py @ 1:7e5c71b2e71f draft default tip

Uploaded
author laurenmarazzi
date Wed, 22 Dec 2021 16:00:34 +0000
parents
children
comparison
equal deleted inserted replaced
0:f24d4892aaed 1:7e5c71b2e71f
1 #!/usr/bin/env python3
2 import pandas as pd
3 from scipy import stats
4 import matplotlib as mpl
5 mpl.use('Agg')
6 import matplotlib.pyplot as plt
7 from sklearn.decomposition import PCA
8 import numpy as np
9 from sklearn.datasets import make_blobs
10 from sklearn.cluster import MiniBatchKMeans
11 from yellowbrick.cluster.elbow import kelbow_visualizer
12 import sys
13 import os
14
def main():
    """Cluster the input table(s) with mini-batch k-means and save the labels.

    Command line:
        sys.argv[1] -- comma-separated list of whitespace-delimited table
        files; each must contain a ``name`` column, which becomes the index.

    The tables are stacked row-wise and columns that are zero in every row
    are dropped. Two yellowbrick k-elbow visualizers (default metric and
    ``metric='silhouette'``) each propose a number of clusters; the smaller
    proposal is used for the final fit.

    Files written to the current working directory:
        elbow.png      -- plot from the default-metric visualizer
        silhouette.png -- plot from the silhouette visualizer
        kmeans.txt     -- space-separated table: ``name``, ``clusters``

    Raises:
        SystemExit: if no input file list is given on the command line.
        ValueError: if neither visualizer could locate an optimal k.
    """
    ##### Input files and user specifications #####
    if len(sys.argv) < 2:
        sys.exit("usage: kmeans_full.py FILE[,FILE...]")
    # the input data file(s) (logss, DAC, both, or discrete versions)
    datasets = sys.argv[1].split(',')

    # Read every table first and concatenate once: avoids growing a frame
    # inside the loop and concatenating onto an empty DataFrame.
    # sep=r"\s+" is the non-deprecated spelling of delim_whitespace=True.
    frames = [
        pd.read_csv(path, sep=r"\s+", index_col=["name"])
        for path in datasets
    ]
    df = pd.concat(frames, axis=0)

    # Drop columns that are zero for every row.
    df = df.loc[:, (df != 0).any(axis=0)]

    # Use the yellowbrick quick method, then save each figure to disk.
    visualizer1 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        title="optimal k via elbow method",
    )
    elbow = visualizer1.elbow_value_
    visualizer1.show('elbow.png', clear_figure=True)

    visualizer2 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        metric='silhouette',
        title="optimal k via silhouette method",
    )
    silhouette = visualizer2.elbow_value_
    visualizer2.show("silhouette.png")

    # elbow_value_ is None when no knee could be located; comparing None
    # with an int would raise an opaque TypeError, so filter those out and
    # take the smaller of whatever was actually detected.
    candidates = [value for value in (elbow, silhouette) if value is not None]
    if not candidates:
        raise ValueError(
            "could not determine the number of clusters: neither the elbow "
            "nor the silhouette method located an optimal k"
        )
    k = min(candidates)

    # report out final k-means selected
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, n_init=100).fit(df)
    labels = kmeans.labels_

    # Only the index and the cluster labels are written out; the feature
    # table itself is left untouched.
    df2 = pd.DataFrame(index=df.index)
    df2['clusters'] = labels
    df2.index.name = 'name'
    df2.to_csv('kmeans.txt', sep=' ')
# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger file I/O and model fitting.
if __name__ == "__main__":
    main()