Previous changeset 38:4e1b466935cd (2019-11-25) Next changeset 40:2a082b4aed02 (2019-11-25) |
Commit message:
Deleted selected files |
removed:
marea_cluster.py |
b |
diff -r 4e1b466935cd -r 56faa11c6c78 marea_cluster.py --- a/marea_cluster.py Mon Nov 25 12:02:28 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 |
[ |
b'@@ -1,399 +0,0 @@\n-# -*- coding: utf-8 -*-\n-"""\n-Created on Mon Jun 3 19:51:00 2019\n-@author: Narger\n-"""\n-\n-import sys\n-import argparse\n-import os\n-from sklearn.datasets import make_blobs\n-from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n-from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster\n-import matplotlib\n-matplotlib.use(\'agg\')\n-import matplotlib.pyplot as plt\n-import scipy.cluster.hierarchy as shc \n-import matplotlib.cm as cm\n-import numpy as np\n-import pandas as pd\n-\n-################################# process args ###############################\n-\n-def process_args(args):\n- parser = argparse.ArgumentParser(usage = \'%(prog)s [options]\',\n- description = \'process some value\\\'s\' +\n- \' genes to create class.\')\n-\n- parser.add_argument(\'-ol\', \'--out_log\', \n- help = "Output log")\n- \n- parser.add_argument(\'-in\', \'--input\',\n- type = str,\n- help = \'input dataset\')\n- \n- parser.add_argument(\'-cy\', \'--cluster_type\',\n- type = str,\n- choices = [\'kmeans\', \'dbscan\', \'hierarchy\'],\n- default = \'kmeans\',\n- help = \'choose clustering algorythm\')\n- \n- parser.add_argument(\'-k1\', \'--k_min\', \n- type = int,\n- default = 2,\n- help = \'choose minimun cluster number to be generated\')\n- \n- parser.add_argument(\'-k2\', \'--k_max\', \n- type = int,\n- default = 7,\n- help = \'choose maximum cluster number to be generated\')\n- \n- parser.add_argument(\'-el\', \'--elbow\', \n- type = str,\n- default = \'false\',\n- choices = [\'true\', \'false\'],\n- help = \'choose if you want to generate an elbow plot for kmeans\')\n- \n- parser.add_argument(\'-si\', \'--silhouette\', \n- type = str,\n- default = \'false\',\n- choices = [\'true\', \'false\'],\n- help = \'choose if you want silhouette plots\')\n- \n- parser.add_argument(\'-td\', \'--tool_dir\',\n- type = str,\n- required = True,\n- help = \'your tool directory\')\n- \n- parser.add_argument(\'-ms\', \'--min_samples\',\n- type = float,\n- help = \'min samples for dbscan (optional)\')\n- \n- parser.add_argument(\'-ep\', \'--eps\',\n- type = float,\n- help = \'eps for dbscan (optional)\')\n- \n- parser.add_argument(\'-bc\', \'--best_cluster\',\n- type = str,\n- help = \'output of best cluster tsv\')\n- \t\t\t\t\n- \n- \n- args = parser.parse_args()\n- return args\n-\n-########################### warning ###########################################\n-\n-def warning(s):\n- args = process_args(sys.argv)\n- with open(args.out_log, \'a\') as log:\n- log.write(s + "\\n\\n")\n- print(s)\n-\n-########################## read dataset ######################################\n- \n-def read_dataset(dataset):\n- try:\n- dataset = pd.read_csv(dataset, sep = \'\\t\', header = 0)\n- except pd.errors.EmptyDataError:\n- sys.exit(\'Execution aborted: wrong format of dataset\\n\')\n- if len(dataset.columns) < 2:\n- sys.exit(\'Execution aborted: wrong format of dataset\\n\')\n- return dataset\n-\n-############################ rewrite_input ###################################\n- \n-def rewrite_input(dataset):\n- #Riscrivo il dataset come dizionario di liste, \n- #non come dizionario di dizionari\n- \n- dataset.pop(\'Reactions\', None)\n- \n- for key'..b'ilhouette analysis for clustering on sample data "\n- "with n_clusters = " + str(n_clusters) + "\\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight=\'bold\')\n- \n- \n- plt.savefig(path, bbox_inches=\'tight\')\n- \n-######################## dbscan ##############################################\n- \n-def dbscan(dataset, eps, min_samples, best_cluster):\n- if not os.path.exists(\'clustering\'):\n- os.makedirs(\'clustering\')\n- \n- if eps is not None:\n- \tclusterer = DBSCAN(eps = eps, min_samples = min_samples)\n- else:\n- \tclusterer = DBSCAN()\n- \n- clustering = clusterer.fit(dataset)\n- \n- core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)\n- core_samples_mask[clustering.core_sample_indices_] = True\n- labels = clustering.labels_\n-\n- # Number of clusters in labels, ignoring noise if present.\n- n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)\n- \n- \n- labels = labels\n- predict = [x+1 for x in labels]\n- classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)\n- classe.to_csv(best_cluster, sep = \'\\t\', index = False, header = [\'Patient_ID\', \'Class\'])\n- \n- \n-########################## hierachical #######################################\n- \n-def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette):\n-\n- if not os.path.exists(\'clustering\'):\n- os.makedirs(\'clustering\')\n- \n- plt.figure(figsize=(10, 7)) \n- plt.title("Classes Dendogram") \n- shc.dendrogram(shc.linkage(dataset, method=\'ward\'), labels=dataset.index.values.tolist()) \n- fig = plt.gcf()\n- fig.savefig(\'clustering/dendogram.png\', dpi=200)\n- \n- range_n_clusters = [i for i in range(k_min, k_max+1)]\n-\n- scores = []\n- labels = []\n- \n- for n_clusters in range_n_clusters: \n- cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity=\'euclidean\', linkage=\'ward\') \n- cluster.fit_predict(dataset) \n- cluster_labels = cluster.labels_\n- labels.append(cluster_labels)\n- write_to_csv(dataset, cluster_labels, \'clustering/hierarchical_with_\' + str(n_clusters) + \'_clusters.tsv\')\n- \n- best = max_index(scores) + k_min\n- \n- for i in range(len(labels)):\n- prefix = \'\'\n- if (i + k_min == best):\n- prefix = \'_BEST\'\n- if silhouette == \'true\':\n- silihouette_draw(dataset, labels[i], i + k_min, \'clustering/silhouette_with_\' + str(i + k_min) + prefix + \'_clusters.png\')\n- \n- for i in range(len(labels)):\n- if (i + k_min == best):\n- labels = labels[i]\n- predict = [x+1 for x in labels]\n- classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)\n- classe.to_csv(best_cluster, sep = \'\\t\', index = False, header = [\'Patient_ID\', \'Class\'])\n- \n- \n-############################# main ###########################################\n-\n-\n-def main():\n- if not os.path.exists(\'clustering\'):\n- os.makedirs(\'clustering\')\n-\n- args = process_args(sys.argv)\n- \n- #Data read\n- \n- X = read_dataset(args.input)\n- X = pd.DataFrame.to_dict(X, orient=\'list\')\n- X = rewrite_input(X)\n- X = pd.DataFrame.from_dict(X, orient = \'index\')\n- \n- for i in X.columns:\n- tmp = X[i][0]\n- if tmp == None:\n- X = X.drop(columns=[i])\n- \n- \n- if args.cluster_type == \'kmeans\':\n- kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)\n- \n- if args.cluster_type == \'dbscan\':\n- dbscan(X, args.eps, args.min_samples, args.best_cluster)\n- \n- if args.cluster_type == \'hierarchy\':\n- hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)\n- \n-##############################################################################\n-\n-if __name__ == "__main__":\n- main()\n' |