| 
4
 | 
     1 # -*- coding: utf-8 -*-
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 Created on Mon Jun 3 19:51:00 2019
 | 
| 
 | 
     4 @author: Narger
 | 
| 
 | 
     5 """
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 import sys
 | 
| 
 | 
     8 import argparse
 | 
| 
 | 
     9 import os
 | 
| 
 | 
    10 import numpy as np
 | 
| 
 | 
    11 import pandas as pd
 | 
| 
 | 
    12 from sklearn.datasets import make_blobs
 | 
| 
 | 
    13 from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 | 
| 
 | 
    14 from sklearn.metrics import silhouette_samples, silhouette_score, cluster
 | 
| 
 | 
    15 import matplotlib
 | 
| 
 | 
    16 matplotlib.use('agg')
 | 
| 
 | 
    17 import matplotlib.pyplot as plt
 | 
| 
 | 
    18 import scipy.cluster.hierarchy as shc   
 | 
| 
 | 
    19 import matplotlib.cm as cm
 | 
| 
 | 
    20 from typing import Optional, Dict, List
 | 
| 
 | 
    21 
 | 
| 
 | 
    22 ################################# process args ###############################
 | 
| 
155
 | 
    23 def process_args(args_in :List[str] = None) -> argparse.Namespace:
 | 
| 
4
 | 
    24     """
 | 
| 
 | 
    25     Processes command-line arguments.
 | 
| 
 | 
    26 
 | 
| 
 | 
    27     Args:
 | 
| 
 | 
    28         args (list): List of command-line arguments.
 | 
| 
 | 
    29 
 | 
| 
 | 
    30     Returns:
 | 
| 
 | 
    31         Namespace: An object containing parsed arguments.
 | 
| 
 | 
    32     """
 | 
| 
 | 
    33     parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
 | 
| 
 | 
    34                                      description = 'process some value\'s' +
 | 
| 
 | 
    35                                      ' genes to create class.')
 | 
| 
 | 
    36 
 | 
| 
 | 
    37     parser.add_argument('-ol', '--out_log', 
 | 
| 
 | 
    38                         help = "Output log")
 | 
| 
 | 
    39     
 | 
| 
 | 
    40     parser.add_argument('-in', '--input',
 | 
| 
 | 
    41                         type = str,
 | 
| 
 | 
    42                         help = 'input dataset')
 | 
| 
 | 
    43     
 | 
| 
 | 
    44     parser.add_argument('-cy', '--cluster_type',
 | 
| 
 | 
    45                         type = str,
 | 
| 
 | 
    46                         choices = ['kmeans', 'dbscan', 'hierarchy'],
 | 
| 
 | 
    47                         default = 'kmeans',
 | 
| 
 | 
    48                         help = 'choose clustering algorythm')
 | 
| 
 | 
    49     
 | 
| 
 | 
    50     parser.add_argument('-k1', '--k_min', 
 | 
| 
 | 
    51                         type = int,
 | 
| 
 | 
    52                         default = 2,
 | 
| 
 | 
    53                         help = 'choose minimun cluster number to be generated')
 | 
| 
 | 
    54     
 | 
| 
 | 
    55     parser.add_argument('-k2', '--k_max', 
 | 
| 
 | 
    56                         type = int,
 | 
| 
 | 
    57                         default = 7,
 | 
| 
 | 
    58                         help = 'choose maximum cluster number to be generated')
 | 
| 
 | 
    59     
 | 
| 
 | 
    60     parser.add_argument('-el', '--elbow', 
 | 
| 
 | 
    61                         type = str,
 | 
| 
317
 | 
    62                         default = 'false',
 | 
| 
 | 
    63                         choices = ['true', 'false'],
 | 
| 
4
 | 
    64                         help = 'choose if you want to generate an elbow plot for kmeans')
 | 
| 
 | 
    65     
 | 
| 
 | 
    66     parser.add_argument('-si', '--silhouette', 
 | 
| 
 | 
    67                         type = str,
 | 
| 
317
 | 
    68                         default = 'false',
 | 
| 
 | 
    69                         choices = ['true', 'false'],
 | 
| 
4
 | 
    70                         help = 'choose if you want silhouette plots')
 | 
| 
 | 
    71     
 | 
| 
 | 
    72     parser.add_argument('-td', '--tool_dir',
 | 
| 
 | 
    73                         type = str,
 | 
| 
 | 
    74                         required = True,
 | 
| 
 | 
    75                         help = 'your tool directory')
 | 
| 
 | 
    76                         
 | 
| 
 | 
    77     parser.add_argument('-ms', '--min_samples',
 | 
| 
309
 | 
    78                         type = int,
 | 
| 
4
 | 
    79                         help = 'min samples for dbscan (optional)')
 | 
| 
 | 
    80                         
 | 
| 
 | 
    81     parser.add_argument('-ep', '--eps',
 | 
| 
 | 
    82                         type = float,
 | 
| 
 | 
    83                         help = 'eps for dbscan (optional)')
 | 
| 
 | 
    84                         
 | 
| 
 | 
    85     parser.add_argument('-bc', '--best_cluster',
 | 
| 
 | 
    86                         type = str,
 | 
| 
 | 
    87                         help = 'output of best cluster tsv')
 | 
| 
 | 
    88     				
 | 
| 
147
 | 
    89     parser.add_argument(
 | 
| 
 | 
    90         '-idop', '--output_path', 
 | 
| 
 | 
    91         type = str,
 | 
| 
 | 
    92         default='result',
 | 
| 
 | 
    93         help = 'output path for maps')
 | 
| 
4
 | 
    94     
 | 
| 
155
 | 
    95     args_in = parser.parse_args(args_in)
 | 
| 
 | 
    96     return args_in
 | 
| 
4
 | 
    97 
 | 
| 
 | 
    98 ########################### warning ###########################################
 | 
| 
 | 
    99 def warning(s :str) -> None:
 | 
| 
 | 
   100     """
 | 
| 
 | 
   101     Log a warning message to an output log file and print it to the console.
 | 
| 
 | 
   102 
 | 
| 
 | 
   103     Args:
 | 
| 
 | 
   104         s (str): The warning message to be logged and printed.
 | 
| 
 | 
   105     
 | 
| 
 | 
   106     Returns:
 | 
| 
 | 
   107       None
 | 
| 
 | 
   108     """
 | 
| 
309
 | 
   109 
 | 
| 
4
 | 
   110     with open(args.out_log, 'a') as log:
 | 
| 
 | 
   111         log.write(s + "\n\n")
 | 
| 
 | 
   112     print(s)
 | 
| 
 | 
   113 
 | 
| 
 | 
   114 ########################## read dataset ######################################
 | 
| 
 | 
   115 def read_dataset(dataset :str) -> pd.DataFrame:
 | 
| 
 | 
   116     """
 | 
| 
 | 
   117     Read dataset from a CSV file and return it as a Pandas DataFrame.
 | 
| 
 | 
   118 
 | 
| 
 | 
   119     Args:
 | 
| 
 | 
   120         dataset (str): the path to the dataset to convert into a DataFrame
 | 
| 
 | 
   121 
 | 
| 
 | 
   122     Returns:
 | 
| 
 | 
   123         pandas.DataFrame: The dataset loaded as a Pandas DataFrame.
 | 
| 
 | 
   124 
 | 
| 
 | 
   125     Raises:
 | 
| 
 | 
   126         pandas.errors.EmptyDataError: If the dataset file is empty.
 | 
| 
 | 
   127         sys.exit: If the dataset file has the wrong format (e.g., fewer than 2 columns)
 | 
| 
 | 
   128     """
 | 
| 
 | 
   129     try:
 | 
| 
 | 
   130         dataset = pd.read_csv(dataset, sep = '\t', header = 0)
 | 
| 
 | 
   131     except pd.errors.EmptyDataError:
 | 
| 
 | 
   132         sys.exit('Execution aborted: wrong format of dataset\n')
 | 
| 
 | 
   133     if len(dataset.columns) < 2:
 | 
| 
 | 
   134         sys.exit('Execution aborted: wrong format of dataset\n')
 | 
| 
 | 
   135     return dataset
 | 
| 
 | 
   136 
 | 
| 
 | 
   137 ############################ rewrite_input ###################################
 | 
| 
154
 | 
   138 def rewrite_input(dataset :Dict) -> Dict[str, List[Optional[float]]]:
 | 
| 
4
 | 
   139     """
 | 
| 
 | 
   140     Rewrite the dataset as a dictionary of lists instead of as a dictionary of dictionaries.
 | 
| 
 | 
   141 
 | 
| 
 | 
   142     Args:
 | 
| 
 | 
   143         dataset (pandas.DataFrame): The dataset to be rewritten.
 | 
| 
 | 
   144 
 | 
| 
 | 
   145     Returns:
 | 
| 
 | 
   146         dict: The rewritten dataset as a dictionary of lists.
 | 
| 
 | 
   147     """
 | 
| 
 | 
   148     #Riscrivo il dataset come dizionario di liste, 
 | 
| 
 | 
   149     #non come dizionario di dizionari
 | 
| 
153
 | 
   150     #dataset.pop('Reactions', None)
 | 
| 
4
 | 
   151     
 | 
| 
 | 
   152     for key, val in dataset.items():
 | 
| 
 | 
   153         l = []
 | 
| 
 | 
   154         for i in val:
 | 
| 
 | 
   155             if i == 'None':
 | 
| 
 | 
   156                 l.append(None)
 | 
| 
 | 
   157             else:
 | 
| 
 | 
   158                 l.append(float(i))
 | 
| 
 | 
   159    
 | 
| 
 | 
   160         dataset[key] = l
 | 
| 
 | 
   161     
 | 
| 
 | 
   162     return dataset
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 ############################## write to csv ##################################
 | 
| 
 | 
   165 def write_to_csv (dataset :pd.DataFrame, labels :List[str], name :str) -> None:
 | 
| 
 | 
   166     """
 | 
| 
 | 
   167     Write dataset and predicted labels to a CSV file.
 | 
| 
 | 
   168 
 | 
| 
 | 
   169     Args:
 | 
| 
 | 
   170         dataset (pandas.DataFrame): The dataset to be written.
 | 
| 
 | 
   171         labels (list): The predicted labels for each data point.
 | 
| 
 | 
   172         name (str): The name of the output CSV file.
 | 
| 
 | 
   173 
 | 
| 
 | 
   174     Returns:
 | 
| 
 | 
   175         None
 | 
| 
 | 
   176     """
 | 
| 
 | 
   177     #labels = predict
 | 
| 
 | 
   178     predict = [x+1 for x in labels]
 | 
| 
 | 
   179   
 | 
| 
 | 
   180     classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 | 
| 
 | 
   181 
 | 
| 
 | 
   182     dest = name
 | 
| 
 | 
   183     classe.to_csv(dest, sep = '\t', index = False,
 | 
| 
 | 
   184                       header = ['Patient_ID', 'Class'])
 | 
| 
 | 
   185    
 | 
| 
 | 
   186 ########################### trova il massimo in lista ########################
 | 
| 
 | 
   187 def max_index (lista :List[int]) -> int:
 | 
| 
 | 
   188     """
 | 
| 
 | 
   189     Find the index of the maximum value in a list.
 | 
| 
 | 
   190 
 | 
| 
 | 
   191     Args:
 | 
| 
 | 
   192         lista (list): The list in which we search for the index of the maximum value.
 | 
| 
 | 
   193 
 | 
| 
 | 
   194     Returns:
 | 
| 
 | 
   195         int: The index of the maximum value in the list.
 | 
| 
 | 
   196     """
 | 
| 
 | 
   197     best = -1
 | 
| 
 | 
   198     best_index = 0
 | 
| 
 | 
   199     for i in range(len(lista)):
 | 
| 
 | 
   200         if lista[i] > best:
 | 
| 
 | 
   201             best = lista [i]
 | 
| 
 | 
   202             best_index = i
 | 
| 
 | 
   203             
 | 
| 
 | 
   204     return best_index
 | 
| 
 | 
   205     
 | 
| 
 | 
   206 ################################ kmeans #####################################
 | 
| 
 | 
   207 def kmeans (k_min: int, k_max: int, dataset: pd.DataFrame, elbow: str, silhouette: str, best_cluster: str) -> None:
 | 
| 
 | 
   208     """
 | 
| 
 | 
   209     Perform k-means clustering on the given dataset, which is an algorithm used to partition a dataset into groups (clusters) based on their characteristics.
 | 
| 
 | 
   210     The goal is to divide the data into homogeneous groups, where the elements within each group are similar to each other and different from the elements in other groups.
 | 
| 
 | 
   211 
 | 
| 
 | 
   212     Args:
 | 
| 
 | 
   213         k_min (int): The minimum number of clusters to consider.
 | 
| 
 | 
   214         k_max (int): The maximum number of clusters to consider.
 | 
| 
 | 
   215         dataset (pandas.DataFrame): The dataset to perform clustering on.
 | 
| 
309
 | 
   216         elbow (str): Whether to generate an elbow plot for kmeans ('True' or 'False').
 | 
| 
 | 
   217         silhouette (str): Whether to generate silhouette plots ('True' or 'False').
 | 
| 
4
 | 
   218         best_cluster (str): The file path to save the output of the best cluster.
 | 
| 
 | 
   219 
 | 
| 
 | 
   220     Returns:
 | 
| 
 | 
   221         None
 | 
| 
 | 
   222     """
 | 
| 
147
 | 
   223     if not os.path.exists(args.output_path):
 | 
| 
 | 
   224         os.makedirs(args.output_path)
 | 
| 
4
 | 
   225     
 | 
| 
 | 
   226         
 | 
| 
317
 | 
   227     if elbow == 'true':
 | 
| 
4
 | 
   228         elbow = True
 | 
| 
 | 
   229     else:
 | 
| 
 | 
   230         elbow = False
 | 
| 
 | 
   231         
 | 
| 
317
 | 
   232     if silhouette == 'true':
 | 
| 
4
 | 
   233         silhouette = True
 | 
| 
 | 
   234     else:
 | 
| 
 | 
   235         silhouette = False
 | 
| 
 | 
   236         
 | 
| 
 | 
   237     range_n_clusters = [i for i in range(k_min, k_max+1)]
 | 
| 
 | 
   238     distortions = []
 | 
| 
 | 
   239     scores = []
 | 
| 
 | 
   240     all_labels = []
 | 
| 
 | 
   241     
 | 
| 
 | 
   242     clusterer = KMeans(n_clusters=1, random_state=10)
 | 
| 
 | 
   243     distortions.append(clusterer.fit(dataset).inertia_)
 | 
| 
 | 
   244     
 | 
| 
 | 
   245     
 | 
| 
 | 
   246     for n_clusters in range_n_clusters:
 | 
| 
 | 
   247         clusterer = KMeans(n_clusters=n_clusters, random_state=10)
 | 
| 
 | 
   248         cluster_labels = clusterer.fit_predict(dataset)
 | 
| 
 | 
   249         
 | 
| 
 | 
   250         all_labels.append(cluster_labels)
 | 
| 
 | 
   251         if n_clusters == 1:
 | 
| 
 | 
   252             silhouette_avg = 0
 | 
| 
 | 
   253         else:
 | 
| 
 | 
   254             silhouette_avg = silhouette_score(dataset, cluster_labels)
 | 
| 
 | 
   255         scores.append(silhouette_avg)
 | 
| 
 | 
   256         distortions.append(clusterer.fit(dataset).inertia_)
 | 
| 
 | 
   257         
 | 
| 
 | 
   258     best = max_index(scores) + k_min
 | 
| 
 | 
   259         
 | 
| 
 | 
   260     for i in range(len(all_labels)):
 | 
| 
 | 
   261         prefix = ''
 | 
| 
 | 
   262         if (i + k_min == best):
 | 
| 
 | 
   263             prefix = '_BEST'
 | 
| 
 | 
   264             
 | 
| 
147
 | 
   265         write_to_csv(dataset, all_labels[i], f'{args.output_path}/kmeans_with_' + str(i + k_min) + prefix + '_clusters.tsv')
 | 
| 
4
 | 
   266         
 | 
| 
 | 
   267         
 | 
| 
 | 
   268         if (prefix == '_BEST'):
 | 
| 
 | 
   269             labels = all_labels[i]
 | 
| 
 | 
   270             predict = [x+1 for x in labels]
 | 
| 
 | 
   271             classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 | 
| 
 | 
   272             classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
 | 
| 
 | 
   273             
 | 
| 
 | 
   274           
 | 
| 
 | 
   275         
 | 
| 
 | 
   276        
 | 
| 
 | 
   277         if silhouette:
 | 
| 
147
 | 
   278             silhouette_draw(dataset, all_labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
 | 
| 
4
 | 
   279         
 | 
| 
 | 
   280         
 | 
| 
 | 
   281     if elbow:
 | 
| 
 | 
   282         elbow_plot(distortions, k_min,k_max) 
 | 
| 
 | 
   283 
 | 
| 
 | 
   284    
 | 
| 
 | 
   285     
 | 
| 
 | 
   286     
 | 
| 
 | 
   287 
 | 
| 
 | 
   288 ############################## elbow_plot ####################################
 | 
| 
 | 
   289 def elbow_plot (distortions: List[float], k_min: int, k_max: int) -> None:
 | 
| 
 | 
   290     """
 | 
| 
 | 
   291     Generate an elbow plot to visualize the distortion for different numbers of clusters.
 | 
| 
 | 
   292     The elbow plot is a graphical tool used in clustering analysis to help identifying the appropriate number of clusters by looking for the point where the rate of decrease
 | 
| 
 | 
   293     in distortion sharply decreases, indicating the optimal balance between model complexity and clustering quality.
 | 
| 
 | 
   294 
 | 
| 
 | 
   295     Args:
 | 
| 
 | 
   296         distortions (list): List of distortion values for different numbers of clusters.
 | 
| 
 | 
   297         k_min (int): The minimum number of clusters considered.
 | 
| 
 | 
   298         k_max (int): The maximum number of clusters considered.
 | 
| 
 | 
   299 
 | 
| 
 | 
   300     Returns:
 | 
| 
 | 
   301         None
 | 
| 
 | 
   302     """
 | 
| 
 | 
   303     plt.figure(0)
 | 
| 
 | 
   304     x = list(range(k_min, k_max + 1))
 | 
| 
 | 
   305     x.insert(0, 1)
 | 
| 
 | 
   306     plt.plot(x, distortions, marker = 'o')
 | 
| 
 | 
   307     plt.xlabel('Number of clusters (k)')
 | 
| 
 | 
   308     plt.ylabel('Distortion')
 | 
| 
147
 | 
   309     s = f'{args.output_path}/elbow_plot.png'
 | 
| 
4
 | 
   310     fig = plt.gcf()
 | 
| 
 | 
   311     fig.set_size_inches(18.5, 10.5, forward = True)
 | 
| 
 | 
   312     fig.savefig(s, dpi=100)
 | 
| 
 | 
   313     
 | 
| 
 | 
   314     
 | 
| 
 | 
   315 ############################## silhouette plot ###############################
 | 
| 
 | 
   316 def silhouette_draw(dataset: pd.DataFrame, labels: List[str], n_clusters: int, path:str) -> None:
 | 
| 
 | 
   317     """
 | 
| 
 | 
   318     Generate a silhouette plot for the clustering results.
 | 
| 
 | 
   319     The silhouette coefficient is a measure used to evaluate the quality of clusters obtained from a clustering algorithmand it quantifies how similar an object is to its own cluster compared to other clusters.
 | 
| 
 | 
   320     The silhouette coefficient ranges from -1 to 1, where:
 | 
| 
 | 
   321     - A value close to +1 indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. This implies that the object is in a dense, well-separated cluster.
 | 
| 
 | 
   322     - A value close to 0 indicates that the object is close to the decision boundary between two neighboring clusters.
 | 
| 
 | 
   323     - A value close to -1 indicates that the object may have been assigned to the wrong cluster.
 | 
| 
 | 
   324 
 | 
| 
 | 
   325     Args:
 | 
| 
 | 
   326         dataset (pandas.DataFrame): The dataset used for clustering.
 | 
| 
 | 
   327         labels (list): The cluster labels assigned to each data point.
 | 
| 
 | 
   328         n_clusters (int): The number of clusters.
 | 
| 
 | 
   329         path (str): The path to save the silhouette plot image.
 | 
| 
 | 
   330 
 | 
| 
 | 
   331     Returns:
 | 
| 
 | 
   332         None
 | 
| 
 | 
   333     """
 | 
| 
 | 
   334     if n_clusters == 1:
 | 
| 
 | 
   335         return None
 | 
| 
 | 
   336         
 | 
| 
 | 
   337     silhouette_avg = silhouette_score(dataset, labels)
 | 
| 
 | 
   338     warning("For n_clusters = " + str(n_clusters) +
 | 
| 
 | 
   339           " The average silhouette_score is: " + str(silhouette_avg))
 | 
| 
 | 
   340            
 | 
| 
 | 
   341     plt.close('all')
 | 
| 
 | 
   342     # Create a subplot with 1 row and 2 columns
 | 
| 
 | 
   343     fig, (ax1) = plt.subplots(1, 1)
 | 
| 
 | 
   344     
 | 
| 
 | 
   345     fig.set_size_inches(18, 7)
 | 
| 
 | 
   346         
 | 
| 
 | 
   347     # The 1st subplot is the silhouette plot
 | 
| 
 | 
   348     # The silhouette coefficient can range from -1, 1 but in this example all
 | 
| 
 | 
   349     # lie within [-0.1, 1]
 | 
| 
 | 
   350     ax1.set_xlim([-1, 1])
 | 
| 
 | 
   351     # The (n_clusters+1)*10 is for inserting blank space between silhouette
 | 
| 
 | 
   352     # plots of individual clusters, to demarcate them clearly.
 | 
| 
 | 
   353     ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])
 | 
| 
 | 
   354     
 | 
| 
 | 
   355     # Compute the silhouette scores for each sample
 | 
| 
 | 
   356     sample_silhouette_values = silhouette_samples(dataset, labels)
 | 
| 
 | 
   357         
 | 
| 
 | 
   358     y_lower = 10
 | 
| 
 | 
   359     for i in range(n_clusters):
 | 
| 
 | 
   360         # Aggregate the silhouette scores for samples belonging to
 | 
| 
 | 
   361         # cluster i, and sort them
 | 
| 
 | 
   362         ith_cluster_silhouette_values = \
 | 
| 
 | 
   363         sample_silhouette_values[labels == i]
 | 
| 
 | 
   364         
 | 
| 
 | 
   365         ith_cluster_silhouette_values.sort()
 | 
| 
 | 
   366     
 | 
| 
 | 
   367         size_cluster_i = ith_cluster_silhouette_values.shape[0]
 | 
| 
 | 
   368         y_upper = y_lower + size_cluster_i
 | 
| 
 | 
   369     
 | 
| 
 | 
   370         color = cm.nipy_spectral(float(i) / n_clusters)
 | 
| 
 | 
   371         ax1.fill_betweenx(np.arange(y_lower, y_upper),
 | 
| 
 | 
   372                           0, ith_cluster_silhouette_values,
 | 
| 
 | 
   373                                      facecolor=color, edgecolor=color, alpha=0.7)
 | 
| 
 | 
   374         
 | 
| 
 | 
   375         # Label the silhouette plots with their cluster numbers at the middle
 | 
| 
 | 
   376         ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
 | 
| 
 | 
   377         
 | 
| 
 | 
   378         # Compute the new y_lower for next plot
 | 
| 
 | 
   379         y_lower = y_upper + 10  # 10 for the 0 samples
 | 
| 
 | 
   380     
 | 
| 
 | 
   381         ax1.set_title("The silhouette plot for the various clusters.")
 | 
| 
 | 
   382         ax1.set_xlabel("The silhouette coefficient values")
 | 
| 
 | 
   383         ax1.set_ylabel("Cluster label")
 | 
| 
 | 
   384         
 | 
| 
 | 
   385         # The vertical line for average silhouette score of all the values
 | 
| 
 | 
   386         ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
 | 
| 
 | 
   387     
 | 
| 
 | 
   388         ax1.set_yticks([])  # Clear the yaxis labels / ticks
 | 
| 
 | 
   389         ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
 | 
| 
 | 
   390         
 | 
| 
 | 
   391         
 | 
| 
 | 
   392         plt.suptitle(("Silhouette analysis for clustering on sample data "
 | 
| 
 | 
   393                           "with n_clusters = " + str(n_clusters) + "\nAverage silhouette_score = " + str(silhouette_avg)), fontsize=12, fontweight='bold')
 | 
| 
 | 
   394             
 | 
| 
 | 
   395             
 | 
| 
 | 
   396         plt.savefig(path, bbox_inches='tight')
 | 
| 
 | 
   397             
 | 
| 
 | 
   398 ######################## dbscan ##############################################
 | 
| 
 | 
   399 def dbscan(dataset: pd.DataFrame, eps: float, min_samples: float, best_cluster: str) -> None:
 | 
| 
 | 
   400     """
 | 
| 
 | 
   401     Perform DBSCAN clustering on the given dataset, which is a clustering algorithm that groups together closely packed points based on the notion of density.
 | 
| 
 | 
   402 
 | 
| 
 | 
   403     Args:
 | 
| 
 | 
   404         dataset (pandas.DataFrame): The dataset to be clustered.
 | 
| 
 | 
   405         eps (float): The maximum distance between two samples for one to be considered as in the neighborhood of the other.
 | 
| 
 | 
   406         min_samples (float): The number of samples in a neighborhood for a point to be considered as a core point.
 | 
| 
 | 
   407         best_cluster (str): The file path to save the output of the best cluster.
 | 
| 
 | 
   408 
 | 
| 
 | 
   409     Returns:
 | 
| 
 | 
   410         None
 | 
| 
 | 
   411     """
 | 
| 
147
 | 
   412     if not os.path.exists(args.output_path):
 | 
| 
 | 
   413         os.makedirs(args.output_path)
 | 
| 
4
 | 
   414         
 | 
| 
 | 
   415     if eps is not None:
 | 
| 
 | 
   416         clusterer = DBSCAN(eps = eps, min_samples = min_samples)
 | 
| 
 | 
   417     else:
 | 
| 
 | 
   418         clusterer = DBSCAN()
 | 
| 
 | 
   419     
 | 
| 
 | 
   420     clustering = clusterer.fit(dataset)
 | 
| 
 | 
   421     
 | 
| 
 | 
   422     core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
 | 
| 
 | 
   423     core_samples_mask[clustering.core_sample_indices_] = True
 | 
| 
 | 
   424     labels = clustering.labels_
 | 
| 
 | 
   425 
 | 
| 
 | 
   426     # Number of clusters in labels, ignoring noise if present.
 | 
| 
 | 
   427     n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
 | 
| 
 | 
   428     
 | 
| 
 | 
   429     
 | 
| 
 | 
   430     labels = labels
 | 
| 
 | 
   431     predict = [x+1 for x in labels]
 | 
| 
 | 
   432     classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 | 
| 
 | 
   433     classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
 | 
| 
 | 
   434   
 | 
| 
 | 
   435     
 | 
| 
 | 
   436 ########################## hierachical #######################################
 | 
| 
 | 
   437 def hierachical_agglomerative(dataset: pd.DataFrame, k_min: int, k_max: int, best_cluster: str, silhouette: str) -> None:
 | 
| 
 | 
   438     """
 | 
| 
 | 
   439     Perform hierarchical agglomerative clustering on the given dataset.
 | 
| 
 | 
   440 
 | 
| 
 | 
   441     Args:
 | 
| 
 | 
   442         dataset (pandas.DataFrame): The dataset to be clustered.
 | 
| 
 | 
   443         k_min (int): The minimum number of clusters to consider.
 | 
| 
 | 
   444         k_max (int): The maximum number of clusters to consider.
 | 
| 
 | 
   445         best_cluster (str): The file path to save the output of the best cluster.
 | 
| 
309
 | 
   446         silhouette (str): Whether to generate silhouette plots ('True' or 'False').
 | 
| 
4
 | 
   447 
 | 
| 
 | 
   448     Returns:
 | 
| 
 | 
   449         None
 | 
| 
 | 
   450     """
 | 
| 
147
 | 
   451     if not os.path.exists(args.output_path):
 | 
| 
 | 
   452         os.makedirs(args.output_path)
 | 
| 
4
 | 
   453     
 | 
| 
 | 
   454     plt.figure(figsize=(10, 7))  
 | 
| 
 | 
   455     plt.title("Customer Dendograms")  
 | 
| 
 | 
   456     shc.dendrogram(shc.linkage(dataset, method='ward'), labels=dataset.index.values.tolist())  
 | 
| 
 | 
   457     fig = plt.gcf()
 | 
| 
147
 | 
   458     fig.savefig(f'{args.output_path}/dendogram.png', dpi=200)
 | 
| 
4
 | 
   459     
 | 
| 
 | 
   460     range_n_clusters = [i for i in range(k_min, k_max+1)]
 | 
| 
 | 
   461 
 | 
| 
 | 
   462     scores = []
 | 
| 
 | 
   463     labels = []
 | 
| 
 | 
   464     
 | 
| 
 | 
   465     n_classi = dataset.shape[0]
 | 
| 
 | 
   466     
 | 
| 
 | 
   467     for n_clusters in range_n_clusters:  
 | 
| 
 | 
   468         cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')  
 | 
| 
 | 
   469         cluster.fit_predict(dataset)  
 | 
| 
 | 
   470         cluster_labels = cluster.labels_
 | 
| 
 | 
   471         labels.append(cluster_labels)
 | 
| 
147
 | 
   472         write_to_csv(dataset, cluster_labels, f'{args.output_path}/hierarchical_with_' + str(n_clusters) + '_clusters.tsv')
 | 
| 
4
 | 
   473         
 | 
| 
 | 
   474     best = max_index(scores) + k_min
 | 
| 
 | 
   475     
 | 
| 
 | 
   476     for i in range(len(labels)):
 | 
| 
 | 
   477         prefix = ''
 | 
| 
 | 
   478         if (i + k_min == best):
 | 
| 
 | 
   479             prefix = '_BEST'
 | 
| 
317
 | 
   480         if silhouette == 'true':
 | 
| 
147
 | 
   481             silhouette_draw(dataset, labels[i], i + k_min, f'{args.output_path}/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png')
 | 
| 
4
 | 
   482      
 | 
| 
 | 
   483     for i in range(len(labels)):
 | 
| 
 | 
   484         if (i + k_min == best):
 | 
| 
 | 
   485             labels = labels[i]
 | 
| 
 | 
   486             predict = [x+1 for x in labels]
 | 
| 
 | 
   487             classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str)
 | 
| 
 | 
   488             classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class'])
 | 
| 
 | 
   489             
 | 
| 
 | 
   490     
 | 
| 
 | 
   491 ############################# main ###########################################
 | 
| 
147
 | 
   492 def main(args_in:List[str] = None) -> None:
 | 
| 
4
 | 
   493     """
 | 
| 
 | 
   494     Initializes everything and sets the program in motion based on the fronted input arguments.
 | 
| 
 | 
   495 
 | 
| 
 | 
   496     Returns:
 | 
| 
 | 
   497         None
 | 
| 
 | 
   498     """
 | 
| 
147
 | 
   499     global args
 | 
| 
 | 
   500     args = process_args(args_in)
 | 
| 
4
 | 
   501 
 | 
| 
147
 | 
   502     if not os.path.exists(args.output_path):
 | 
| 
 | 
   503         os.makedirs(args.output_path)
 | 
| 
4
 | 
   504     
 | 
| 
 | 
   505     #Data read
 | 
| 
 | 
   506     
 | 
| 
 | 
   507     X = read_dataset(args.input)
 | 
| 
154
 | 
   508     X = X.iloc[:, 1:]
 | 
| 
4
 | 
   509     X = pd.DataFrame.to_dict(X, orient='list')
 | 
| 
 | 
   510     X = rewrite_input(X)
 | 
| 
 | 
   511     X = pd.DataFrame.from_dict(X, orient = 'index')
 | 
| 
 | 
   512     
 | 
| 
 | 
   513     for i in X.columns:
 | 
| 
224
 | 
   514         if any(val is None or np.isnan(val) for val in X[i]):
 | 
| 
4
 | 
   515             X = X.drop(columns=[i])
 | 
| 
 | 
   516             
 | 
| 
 | 
   517     if args.k_max != None:
 | 
| 
 | 
   518        numero_classi = X.shape[0]
 | 
| 
 | 
   519        while args.k_max >= numero_classi:
 | 
| 
 | 
   520           err = 'Skipping k = ' + str(args.k_max) + ' since it is >= number of classes of dataset'
 | 
| 
 | 
   521           warning(err)
 | 
| 
 | 
   522           args.k_max = args.k_max - 1
 | 
| 
 | 
   523     
 | 
| 
 | 
   524     
 | 
| 
 | 
   525     if args.cluster_type == 'kmeans':
 | 
| 
 | 
   526         kmeans(args.k_min, args.k_max, X, args.elbow, args.silhouette, args.best_cluster)
 | 
| 
 | 
   527     
 | 
| 
 | 
   528     if args.cluster_type == 'dbscan':
 | 
| 
 | 
   529         dbscan(X, args.eps, args.min_samples, args.best_cluster)
 | 
| 
 | 
   530         
 | 
| 
 | 
   531     if args.cluster_type == 'hierarchy':
 | 
| 
 | 
   532         hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette)
 | 
| 
 | 
   533         
 | 
| 
 | 
   534 ##############################################################################
 | 
| 
 | 
   535 if __name__ == "__main__":
 | 
| 
 | 
   536     main()
 |