Previous changeset 37:2495c7772ca8 (2019-11-25) Next changeset 39:56faa11c6c78 (2019-11-25) |
Commit message:
Uploaded |
added:
marea_cluster.py |
b |
diff -r 2495c7772ca8 -r 4e1b466935cd marea_cluster.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/marea_cluster.py Mon Nov 25 12:02:28 2019 -0500 |
[ |
b'@@ -0,0 +1,399 @@\n+# -*- coding: utf-8 -*-\n+"""\n+Created on Mon Jun 3 19:51:00 2019\n+@author: Narger\n+"""\n+\n+import sys\n+import argparse\n+import os\n+from sklearn.datasets import make_blobs\n+from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n+from sklearn.metrics import silhouette_samples, silhouette_score, davies_bouldin_score, cluster\n+import matplotlib\n+matplotlib.use(\'agg\')\n+import matplotlib.pyplot as plt\n+import scipy.cluster.hierarchy as shc \n+import matplotlib.cm as cm\n+import numpy as np\n+import pandas as pd\n+\n+################################# process args ###############################\n+\n+def process_args(args):\n+ parser = argparse.ArgumentParser(usage = \'%(prog)s [options]\',\n+ description = \'process some value\\\'s\' +\n+ \' genes to create class.\')\n+\n+ parser.add_argument(\'-ol\', \'--out_log\', \n+ help = "Output log")\n+ \n+ parser.add_argument(\'-in\', \'--input\',\n+ type = str,\n+ help = \'input dataset\')\n+ \n+ parser.add_argument(\'-cy\', \'--cluster_type\',\n+ type = str,\n+ choices = [\'kmeans\', \'dbscan\', \'hierarchy\'],\n+ default = \'kmeans\',\n+ help = \'choose clustering algorythm\')\n+ \n+ parser.add_argument(\'-k1\', \'--k_min\', \n+ type = int,\n+ default = 2,\n+ help = \'choose minimun cluster number to be generated\')\n+ \n+ parser.add_argument(\'-k2\', \'--k_max\', \n+ type = int,\n+ default = 7,\n+ help = \'choose maximum cluster number to be generated\')\n+ \n+ parser.add_argument(\'-el\', \'--elbow\', \n+ type = str,\n+ default = \'false\',\n+ choices = [\'true\', \'false\'],\n+ help = \'choose if you want to generate an elbow plot for kmeans\')\n+ \n+ parser.add_argument(\'-si\', \'--silhouette\', \n+ type = str,\n+ default = \'false\',\n+ choices = [\'true\', \'false\'],\n+ help = \'choose if you want silhouette plots\')\n+ \n+ parser.add_argument(\'-td\', \'--tool_dir\',\n+ type = str,\n+ required = True,\n+ help = \'your tool directory\')\n+ \n+ parser.add_argument(\'-ms\', \'--min_samples\',\n+ type = float,\n+ help = \'min samples for dbscan (optional)\')\n+ \n+ parser.add_argument(\'-ep\', \'--eps\',\n+ type = float,\n+ help = \'eps for dbscan (optional)\')\n+ \n+ parser.add_argument(\'-bc\', \'--best_cluster\',\n+ type = str,\n+ help = \'output of best cluster tsv\')\n+ \t\t\t\t\n+ \n+ \n+ args = parser.parse_args()\n+ return args\n+\n+########################### warning ###########################################\n+\n+def warning(s):\n+ args = process_args(sys.argv)\n+ with open(args.out_log, \'a\') as log:\n+ log.write(s + "\\n\\n")\n+ print(s)\n+\n+########################## read dataset ######################################\n+ \n+def read_dataset(dataset):\n+ try:\n+ dataset = pd.read_csv(dataset, sep = \'\\t\', header = 0)\n+ except pd.errors.EmptyDataError:\n+ sys.exit(\'Execution aborted: wrong format of dataset\\n\')\n+ if len(dataset.columns) < 2:\n+ sys.exit(\'Execution aborted: wrong format of dataset\\n\')\n+ return dataset\n+\n+############################ rewrite_input ###################################\n+ \n+def rewrite_input(dataset):\n+ #Riscrivo il dataset come dizionario di liste, \n+ #non come dizionario di dizionari\n+ \n+ dataset.pop(\'Reactions\', None)\n+ \n+ for key'..b'ilhouette analysis for clustering on sample data "\n+ "with n_clusters = " + str(n_clusters) + "\\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight=\'bold\')\n+ \n+ \n+ plt.savefig(path, bbox_inches=\'tight\')\n+ \n+######################## dbscan ##############################################\n+ \n+def dbscan(dataset, eps, min_samples, best_cluster):\n+ if not os.path.exists(\'clustering\'):\n+ os.makedirs(\'clustering\')\n+ \n+ if eps is not None:\n+ \tclusterer = DBSCAN(eps = eps, min_samples = min_samples)\n+ else:\n+ \tclusterer = DBSCAN()\n+ \n+ clustering = clusterer.fit(dataset)\n+ \n+ core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)\n+ core_samples_mask[clustering.core_sample_indices_] = True\n+ labels = clustering.labels_\n+\n+ # Number of clusters in labels, ignoring noise if present.\n+ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)\n+ \n+ \n+ labels = labels\n+ predict = [x+1 for x in labels]\n+ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)\n+ classe.to_csv(best_cluster, sep = \'\\t\', index = False, header = [\'Patient_ID\', \'Class\'])\n+ \n+ \n+########################## hierachical #######################################\n+ \n+def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette):\n+\n+ if not os.path.exists(\'clustering\'):\n+ os.makedirs(\'clustering\')\n+ \n+ plt.figure(figsize=(10, 7)) \n+ plt.title("Classes Dendogram") \n+ shc.dendrogram(shc.linkage(dataset, method=\'ward\'), labels=dataset.index.values.tolist()) \n+ fig = plt.gcf()\n+ fig.savefig(\'clustering/dendogram.png\', dpi=200)\n+ \n+ range_n_clusters = [i for i in range(k_min, k_max+1)]\n+\n+ scores = []\n+ labels = []\n+ \n+ for n_clusters in range_n_clusters: \n+ cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity=\'euclidean\', linkage=\'ward\') \n+ cluster.fit_predict(dataset) \n+ cluster_labels = cluster.labels_\n+ labels.append(cluster_labels)\n+ write_to_csv(dataset, cluster_labels, \'clustering/hierarchical_with_\' + str(n_clusters) + \'_clusters.tsv\')\n+ \n+ best = max_index(scores) + k_min\n+ \n+ for i in range(len(labels)):\n+ prefix = \'\'\n+ if (i + k_min == best):\n+ prefix = \'_BEST\'\n+ if silhouette == \'true\':\n+ silihouette_draw(dataset, labels[i], i + k_min, \'clustering/silhouette_with_\' + str(i + k_min) + prefix + \'_clusters.png\')\n+ \n+ for i in range(len(labels)):\n+ if (i + k_min == best):\n+ labels = labels[i]\n+ predict = [x+1 for x in labels]\n+ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)\n+ classe.to_csv(best_cluster, sep = \'\\t\', index = False, header = [\'Patient_ID\', \'Class\'])\n+ \n+ \n+############################# main ###########################################\n+\n+\n+def main():\n+ if not os.path.exists(\'clustering\'):\n+ os.makedirs(\'clustering\')\n+\n+ args = process_args(sys.argv)\n+ \n+ #Data read\n+ \n+ X = read_dataset(args.input)\n+ X = pd.DataFrame.to_dict(X, orient=\'list\')\n+ X = rewrite_input(X)\n+ X = pd.DataFrame.from_dict(X, orient = \'index\')\n+ \n+ for i in X.columns:\n+ tmp = X[i][0]\n+ if tmp == None:\n+ X = X.drop(columns=[i])\n+ \n+ \n+ if args.cluster_type == \'kmeans\':\n+ kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)\n+ \n+ if args.cluster_type == \'dbscan\':\n+ dbscan(X, args.eps, args.min_samples, args.best_cluster)\n+ \n+ if args.cluster_type == \'hierarchy\':\n+ hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)\n+ \n+##############################################################################\n+\n+if __name__ == "__main__":\n+ main()\n' |