# HG changeset patch # User bimib # Date 1574701354 18000 # Node ID 56faa11c6c789c1255da3dab21fe3485e932d05e # Parent 4e1b466935cd7694475f47c07729aa242afeb3d7 Deleted selected files diff -r 4e1b466935cd -r 56faa11c6c78 marea_cluster.py --- a/marea_cluster.py Mon Nov 25 12:02:28 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,399 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Mon Jun 3 19:51:00 2019 -@author: Narger -""" - -import sys -import argparse -import os -from sklearn.datasets import make_blobs -from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering -from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster -import matplotlib -matplotlib.use('agg') -import matplotlib.pyplot as plt -import scipy.cluster.hierarchy as shc -import matplotlib.cm as cm -import numpy as np -import pandas as pd - -################################# process args ############################### - -def process_args(args): - parser = argparse.ArgumentParser(usage = '%(prog)s [options]', - description = 'process some value\'s' + - ' genes to create class.') - - parser.add_argument('-ol', '--out_log', - help = "Output log") - - parser.add_argument('-in', '--input', - type = str, - help = 'input dataset') - - parser.add_argument('-cy', '--cluster_type', - type = str, - choices = ['kmeans', 'dbscan', 'hierarchy'], - default = 'kmeans', - help = 'choose clustering algorythm') - - parser.add_argument('-k1', '--k_min', - type = int, - default = 2, - help = 'choose minimun cluster number to be generated') - - parser.add_argument('-k2', '--k_max', - type = int, - default = 7, - help = 'choose maximum cluster number to be generated') - - parser.add_argument('-el', '--elbow', - type = str, - default = 'false', - choices = ['true', 'false'], - help = 'choose if you want to generate an elbow plot for kmeans') - - parser.add_argument('-si', '--silhouette', - type = str, - default = 'false', - choices = ['true', 'false'], - help = 'choose if you want silhouette plots') - - parser.add_argument('-td', '--tool_dir', - type = str, - required = True, - help = 'your tool directory') - - parser.add_argument('-ms', '--min_samples', - type = float, - help = 'min samples for dbscan (optional)') - - parser.add_argument('-ep', '--eps', - type = float, - help = 'eps for dbscan (optional)') - - parser.add_argument('-bc', '--best_cluster', - type = str, - help = 'output of best cluster tsv') - - - - args = parser.parse_args() - return args - -########################### warning ########################################### - -def warning(s): - args = process_args(sys.argv) - with open(args.out_log, 'a') as log: - log.write(s + "\n\n") - print(s) - -########################## read dataset ###################################### - -def read_dataset(dataset): - try: - dataset = pd.read_csv(dataset, sep = '\t', header = 0) - except pd.errors.EmptyDataError: - sys.exit('Execution aborted: wrong format of dataset\n') - if len(dataset.columns) < 2: - sys.exit('Execution aborted: wrong format of dataset\n') - return dataset - -############################ rewrite_input ################################### - -def rewrite_input(dataset): - #Riscrivo il dataset come dizionario di liste, - #non come dizionario di dizionari - - dataset.pop('Reactions', None) - - for key, val in dataset.items(): - l = [] - for i in val: - if i == 'None': - l.append(None) - else: - l.append(float(i)) - - dataset[key] = l - - return dataset - -############################## write to csv ################################## - -def write_to_csv (dataset, labels, name): - #labels = predict - predict = [x+1 for x in labels] - - classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) - - dest = name - classe.to_csv(dest, sep = '\t', index = False, - header = ['Patient_ID', 'Class']) - -########################### trova il massimo in lista ######################## -def max_index (lista): - best = -1 - best_index = 0 - for i in range(len(lista)): - if lista[i] > best: - best = lista [i] - best_index = i - - return best_index - -################################ kmeans ##################################### - -def kmeans (k_min, k_max, dataset, elbow, silhouette, best_cluster): - if not os.path.exists('clustering'): - os.makedirs('clustering') - - - if elbow == 'true': - elbow = True - else: - elbow = False - - if silhouette == 'true': - silhouette = True - else: - silhouette = False - - range_n_clusters = [i for i in range(k_min, k_max+1)] - distortions = [] - scores = [] - all_labels = [] - - clusterer = KMeans(n_clusters=1, random_state=10) - distortions.append(clusterer.fit(dataset).inertia_) - - - for n_clusters in range_n_clusters: - clusterer = KMeans(n_clusters=n_clusters, random_state=10) - cluster_labels = clusterer.fit_predict(dataset) - - all_labels.append(cluster_labels) - if n_clusters == 1: - silhouette_avg = 0 - else: - silhouette_avg = silhouette_score(dataset, cluster_labels) - scores.append(silhouette_avg) - distortions.append(clusterer.fit(dataset).inertia_) - - best = max_index(scores) + k_min - - for i in range(len(all_labels)): - prefix = '' - if (i + k_min == best): - prefix = '_BEST' - - write_to_csv(dataset, all_labels[i], 'clustering/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv') - - - if (prefix == '_BEST'): - labels = all_labels[i] - predict = [x+1 for x in labels] - classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) - classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - - - - if silhouette: - silihouette_draw(dataset, all_labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') - - - if elbow: - elbow_plot(distortions, k_min,k_max) - - - - - -############################## elbow_plot #################################### - -def elbow_plot (distortions, k_min, k_max): - plt.figure(0) - x = list(range(k_min, k_max + 1)) - x.insert(0, 1) - plt.plot(x, distortions, marker = 'o') - plt.xlabel('Number of clusters (k)') - plt.ylabel('Distortion') - s = 'clustering/elbow_plot.png' - fig = plt.gcf() - fig.set_size_inches(18.5, 10.5, forward = True) - fig.savefig(s, dpi=100) - - -############################## silhouette plot ############################### -def silihouette_draw(dataset, labels, n_clusters, path): - if n_clusters == 1: - return None - - silhouette_avg = silhouette_score(dataset, labels) - warning("For n_clusters = " + str(n_clusters) + - " The average silhouette_score is: " + str(silhouette_avg)) - - plt.close('all') - # Create a subplot with 1 row and 2 columns - fig, (ax1) = plt.subplots(1, 1) - - fig.set_size_inches(18, 7) - - # The 1st subplot is the silhouette plot - # The silhouette coefficient can range from -1, 1 but in this example all - # lie within [-0.1, 1] - ax1.set_xlim([-1, 1]) - # The (n_clusters+1)*10 is for inserting blank space between silhouette - # plots of individual clusters, to demarcate them clearly. - ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10]) - - # Compute the silhouette scores for each sample - sample_silhouette_values = silhouette_samples(dataset, labels) - - y_lower = 10 - for i in range(n_clusters): - # Aggregate the silhouette scores for samples belonging to - # cluster i, and sort them - ith_cluster_silhouette_values = \ - sample_silhouette_values[labels == i] - - ith_cluster_silhouette_values.sort() - - size_cluster_i = ith_cluster_silhouette_values.shape[0] - y_upper = y_lower + size_cluster_i - - color = cm.nipy_spectral(float(i) / n_clusters) - ax1.fill_betweenx(np.arange(y_lower, y_upper), - 0, ith_cluster_silhouette_values, - facecolor=color, edgecolor=color, alpha=0.7) - - # Label the silhouette plots with their cluster numbers at the middle - ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) - - # Compute the new y_lower for next plot - y_lower = y_upper + 10 # 10 for the 0 samples - - ax1.set_title("The silhouette plot for the various clusters.") - ax1.set_xlabel("The silhouette coefficient values") - ax1.set_ylabel("Cluster label") - - # The vertical line for average silhouette score of all the values - ax1.axvline(x=silhouette_avg, color="red", linestyle="--") - - ax1.set_yticks([]) # Clear the yaxis labels / ticks - ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) - - - plt.suptitle(("Silhouette analysis for clustering on sample data " - "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold') - - - plt.savefig(path, bbox_inches='tight') - -######################## dbscan ############################################## - -def dbscan(dataset, eps, min_samples, best_cluster): - if not os.path.exists('clustering'): - os.makedirs('clustering') - - if eps is not None: - clusterer = DBSCAN(eps = eps, min_samples = min_samples) - else: - clusterer = DBSCAN() - - clustering = clusterer.fit(dataset) - - core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool) - core_samples_mask[clustering.core_sample_indices_] = True - labels = clustering.labels_ - - # Number of clusters in labels, ignoring noise if present. - n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - - - labels = labels - predict = [x+1 for x in labels] - classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) - classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - -########################## hierachical ####################################### - -def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette): - - if not os.path.exists('clustering'): - os.makedirs('clustering') - - plt.figure(figsize=(10, 7)) - plt.title("Classes Dendogram") - shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist()) - fig = plt.gcf() - fig.savefig('clustering/dendogram.png', dpi=200) - - range_n_clusters = [i for i in range(k_min, k_max+1)] - - scores = [] - labels = [] - - for n_clusters in range_n_clusters: - cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') - cluster.fit_predict(dataset) - cluster_labels = cluster.labels_ - labels.append(cluster_labels) - write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') - - best = max_index(scores) + k_min - - for i in range(len(labels)): - prefix = '' - if (i + k_min == best): - prefix = '_BEST' - if silhouette == 'true': - silihouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') - - for i in range(len(labels)): - if (i + k_min == best): - labels = labels[i] - predict = [x+1 for x in labels] - classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) - classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - -############################# main ########################################### - - -def main(): - if not os.path.exists('clustering'): - os.makedirs('clustering') - - args = process_args(sys.argv) - - #Data read - - X = read_dataset(args.input) - X = pd.DataFrame.to_dict(X, orient='list') - X = rewrite_input(X) - X = pd.DataFrame.from_dict(X, orient = 'index') - - for i in X.columns: - tmp = X[i][0] - if tmp == None: - X = X.drop(columns=[i]) - - - if args.cluster_type == 'kmeans': - kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster) - - if args.cluster_type == 'dbscan': - dbscan(X, args.eps, args.min_samples, args.best_cluster) - - if args.cluster_type == 'hierarchy': - hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette) - -############################################################################## - -if __name__ == "__main__": - main()