annotate tools/myTools/bin/kmeans_full.py @ 1:7e5c71b2e71f draft default tip

Uploaded
author laurenmarazzi
date Wed, 22 Dec 2021 16:00:34 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
1 #!/usr/bin/env python3
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
2 import pandas as pd
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
3 from scipy import stats
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
4 import matplotlib as mpl
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
5 mpl.use('Agg')
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
6 import matplotlib.pyplot as plt
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
7 from sklearn.decomposition import PCA
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
8 import numpy as np
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
9 from sklearn.datasets import make_blobs
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
10 from sklearn.cluster import MiniBatchKMeans
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
11 from yellowbrick.cluster.elbow import kelbow_visualizer
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
12 import sys
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
13 import os
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
14
7e5c71b2e71f Uploaded
laurenmarazzi
parents:
diff changeset
def _load_datasets(paths):
    """Read every whitespace-delimited table in *paths* and stack them row-wise.

    Each file must provide a ``name`` column, which becomes the index of the
    returned DataFrame.
    """
    # sep=r"\s+" is the supported spelling of the deprecated
    # delim_whitespace=True and parses the files identically.
    frames = [pd.read_csv(path, sep=r"\s+", index_col=["name"]) for path in paths]
    # A single concat avoids re-copying the accumulated frame once per file and
    # avoids concatenating onto an empty seed DataFrame.
    return pd.concat(frames, axis=0)


def _select_k(elbow, silhouette):
    """Return the smaller of the two suggested cluster counts.

    Either suggestion may be None when no elbow point was located; the other
    one is used in that case.

    Raises:
        ValueError: if neither method produced a suggestion.
    """
    candidates = [value for value in (elbow, silhouette) if value is not None]
    if not candidates:
        raise ValueError(
            "could not determine k: neither the elbow nor the silhouette "
            "method located an optimal number of clusters"
        )
    return min(candidates)


def main():
    """Cluster the input rows with mini-batch k-means and write the labels.

    ``sys.argv[1]`` is a comma-separated list of input tables (per the original
    note: logss, DAC, both, or discrete versions). The script:

    1. loads and stacks the tables, dropping columns that are zero everywhere;
    2. runs yellowbrick's k-elbow search twice (default metric and
       ``silhouette``) over ``k=(2,10)``, saving ``elbow.png`` and
       ``silhouette.png`` in the working directory;
    3. fits ``MiniBatchKMeans`` with the smaller suggested k and writes the
       space-separated ``name``/``clusters`` table to ``kmeans.txt``.
    """
    ##### Input files and user specifications #####
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE[,FILE...]" % sys.argv[0])
    datasets = sys.argv[1].split(',')
    df = _load_datasets(datasets)

    # Keep only the columns that contain at least one non-zero value.
    df = df.loc[:, (df != 0).any(axis=0)]

    # Use yellowbrick's quick method for both searches, then save each figure.
    visualizer1 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        title="optimal k via elbow method",
    )
    elbow = visualizer1.elbow_value_
    # clear_figure=True so the second search does not draw onto this plot.
    visualizer1.show('elbow.png', clear_figure=True)

    visualizer2 = kelbow_visualizer(
        MiniBatchKMeans(random_state=0, n_init=10),
        df,
        k=(2, 10),
        metric='silhouette',
        title="optimal k via silhouette method",
    )
    silhouette = visualizer2.elbow_value_
    visualizer2.show("silhouette.png")

    # Prefer the smaller suggestion; tolerate one search finding no elbow.
    k = _select_k(elbow, silhouette)

    # Fit the final model with the selected k and report the assignments.
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, n_init=100).fit(df)
    df2 = pd.DataFrame(index=df.index)
    df2['clusters'] = kmeans.labels_
    df2.index.name = 'name'
    df2.to_csv('kmeans.txt', sep=' ')


if __name__ == "__main__":
    main()