Mercurial > repos > laurenmarazzi > netisce_test
comparison tools/myTools/bin/kmeans_full.py @ 1:7e5c71b2e71f draft default tip
Uploaded
author | laurenmarazzi |
---|---|
date | Wed, 22 Dec 2021 16:00:34 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:f24d4892aaed | 1:7e5c71b2e71f |
---|---|
1 #!/usr/bin/env python3 | |
2 import pandas as pd | |
3 from scipy import stats | |
4 import matplotlib as mpl | |
5 mpl.use('Agg') | |
6 import matplotlib.pyplot as plt | |
7 from sklearn.decomposition import PCA | |
8 import numpy as np | |
9 from sklearn.datasets import make_blobs | |
10 from sklearn.cluster import MiniBatchKMeans | |
11 from yellowbrick.cluster.elbow import kelbow_visualizer | |
12 import sys | |
13 import os | |
14 | |
def _choose_k(elbow, silhouette):
    """Return the smaller of the two suggested cluster counts.

    Either argument may be ``None`` (the visualizer found no optimum for that
    metric); in that case the other suggestion is used on its own.

    Raises:
        ValueError: if both suggestions are ``None``.
    """
    candidates = [k for k in (elbow, silhouette) if k is not None]
    if not candidates:
        raise ValueError(
            "could not determine k: neither the elbow nor the silhouette "
            "method located an optimal value in the range searched"
        )
    return int(min(candidates))


def main():
    """Cluster the input tables with mini-batch k-means and write the labels.

    ``sys.argv[1]`` is a comma-separated list of whitespace-delimited table
    files, each with a ``name`` column that becomes the row index.  The tables
    are stacked row-wise, columns that are zero in every row are dropped, and
    k is chosen as the smaller of the values suggested by the elbow
    (distortion) and silhouette visualizers over ``k=(2,10)``.

    Outputs, all written to the current working directory:
        elbow.png       -- elbow-method plot
        silhouette.png  -- silhouette-method plot
        kmeans.txt      -- space-separated table: ``name`` and ``clusters``

    Raises:
        ValueError: if neither method can suggest a value for k.
    """
    #####Input files and user specifications###########
    # the input data file(s) (logss, DAC, both, or discrete versions)
    datasets = sys.argv[1].split(',')

    # Read every table first and concatenate once: avoids repeatedly copying
    # a growing frame and avoids concatenating onto an empty DataFrame.
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    frames = [
        pd.read_csv(path, sep=r'\s+', index_col=["name"])
        for path in datasets
    ]
    df = pd.concat(frames, axis=0)

    # Keep only columns that have at least one non-zero entry.
    df = df.loc[:, (df != 0).any(axis=0)]

    # Use the quick method, then save each figure to disk.
    visualizer1 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        title="optimal k via elbow method",
    )
    elbow = visualizer1.elbow_value_
    # clear_figure=True so the second plot does not draw over the first one.
    visualizer1.show('elbow.png', clear_figure=True)

    visualizer2 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        metric='silhouette',
        title="optimal k via silhouette method",
    )
    silhouette = visualizer2.elbow_value_
    visualizer2.show("silhouette.png")

    # Prefer the smaller suggestion; tolerate one of them being unavailable.
    k = _choose_k(elbow, silhouette)

    # report out final k-means selected
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, n_init=100).fit(df)
    labels = kmeans.labels_

    # One row per input row, carrying only its assigned cluster label.
    df2 = pd.DataFrame(index=df.index)
    df2['clusters'] = labels
    df2.index.name = 'name'
    df2.to_csv('kmeans.txt', sep=' ')


# Run only when executed as a script, so importing the module has no effects.
if __name__ == "__main__":
    main()