Previous changeset 33:abf0bfe01c78 (2019-10-16) Next changeset 35:7b1971251c63 (2019-11-25) |
Commit message:
Lot of bug fixes |
modified:
Marea/marea.py Marea/marea.xml Marea/marea_cluster.py Marea/marea_cluster.xml |
b |
diff -r abf0bfe01c78 -r 1a97d1537623 Marea/marea.py --- a/Marea/marea.py Wed Oct 16 16:25:56 2019 -0400 +++ b/Marea/marea.py Sat Oct 26 07:49:31 2019 -0400 |
b |
@@ -836,6 +836,8 @@ 'will be considered NaN\n') if resolve_rules != None: class_pat = split_class(classes, resolve_rules) + if generate_ras: + create_ras(resolve_rules, name, False) if args.rules_selector == 'Custom': |
b |
diff -r abf0bfe01c78 -r 1a97d1537623 Marea/marea.xml --- a/Marea/marea.xml Wed Oct 16 16:25:56 2019 -0400 +++ b/Marea/marea.xml Sat Oct 26 07:49:31 2019 -0400 |
b |
@@ -1,4 +1,4 @@ -<tool id="MaREA" name="Metabolic Reaction Enrichment Analysis" version="1.0.4"> +<tool id="MaREA" name="Metabolic Reaction Enrichment Analysis" version="1.0.5"> <description></description> <macros> <import>marea_macros.xml</import> |
b |
diff -r abf0bfe01c78 -r 1a97d1537623 Marea/marea_cluster.py --- a/Marea/marea_cluster.py Wed Oct 16 16:25:56 2019 -0400 +++ b/Marea/marea_cluster.py Sat Oct 26 07:49:31 2019 -0400 |
[ |
@@ -208,12 +208,7 @@ classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - if davies: - with np.errstate(divide='ignore', invalid='ignore'): - davies_bouldin = davies_bouldin_score(dataset, all_labels[i]) - warning("\nFor n_clusters = " + str(i + k_min) + - " The average davies bouldin score is: " + str(davies_bouldin)) + if silhouette: @@ -329,8 +324,6 @@ n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) - ##TODO: PLOT SU DBSCAN (no centers) e HIERARCHICAL - labels = labels predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) @@ -339,7 +332,7 @@ ########################## hierachical ####################################### -def hierachical_agglomerative(dataset, k_min, k_max, best_cluster): +def hierachical_agglomerative(dataset, k_min, k_max, best_cluster, silhouette): if not os.path.exists('clustering'): os.makedirs('clustering') @@ -354,18 +347,22 @@ scores = [] labels = [] + for n_clusters in range_n_clusters: cluster = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward') cluster.fit_predict(dataset) cluster_labels = cluster.labels_ labels.append(cluster_labels) - silhouette_avg = silhouette_score(dataset, cluster_labels) write_to_csv(dataset, cluster_labels, 'clustering/hierarchical_with_' + str(n_clusters) + '_clusters.tsv') - scores.append(silhouette_avg) - #warning("For n_clusters =", n_clusters, - #"The average silhouette_score is :", silhouette_avg) best = max_index(scores) + k_min + + for i in range(len(labels)): + prefix = '' + if (i + k_min == best): + prefix = '_BEST' + if silhouette == 'true': + silihouette_draw(dataset, labels[i], i + k_min, 'clustering/silhouette_with_' + str(i + k_min) + prefix + '_clusters.png') for i in range(len(labels)): if (i + k_min == best): @@ -373,11 +370,7 @@ predict = [x+1 for x in labels] classe = (pd.DataFrame(list(zip(dataset.index, predict)))).astype(str) classe.to_csv(best_cluster, sep = '\t', index = False, header = ['Patient_ID', 'Class']) - - - - - + ############################# main ########################################### @@ -408,7 +401,7 @@ dbscan(X, args.eps, args.min_samples, args.best_cluster) if args.cluster_type == 'hierarchy': - hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster) + hierachical_agglomerative(X, args.k_min, args.k_max, args.best_cluster, args.silhouette) ############################################################################## |
b |
diff -r abf0bfe01c78 -r 1a97d1537623 Marea/marea_cluster.xml --- a/Marea/marea_cluster.xml Wed Oct 16 16:25:56 2019 -0400 +++ b/Marea/marea_cluster.xml Sat Oct 26 07:49:31 2019 -0400 |
b |
@@ -1,4 +1,4 @@ -<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.7"> +<tool id="MaREA_cluester" name="Cluster Analysis" version="1.0.8"> <description></description> <macros> <import>marea_macros.xml</import> @@ -34,6 +34,7 @@ #if $data.clust_type == 'hierarchy': --k_min ${data.k_min} --k_max ${data.k_max} + --silhouette ${data.silhouette} #end if ]]> </command> @@ -67,8 +68,9 @@ </conditional> </when> <when value="hierarchy"> - <param name="k_min" argument="--k_min" type="integer" min="2" max="99" value="3" label="Min number of clusters (k) to be tested" /> - <param name="k_max" argument="--k_max" type="integer" min="3" max="99" value="5" label="Max number of clusters (k) to be tested" /> + <param name="k_min" argument="--k_min" type="integer" min="2" max="20" value="2" label="Min number of clusters (k) to be tested" /> + <param name="k_max" argument="--k_max" type="integer" min="3" max="20" value="3" label="Max number of clusters (k) to be tested" /> + <param name="silhouette" argument="--silhouette" type="boolean" value="true" label="Draw the Silhouette plot from k-min to k-max"/> </when> </conditional> </inputs> @@ -87,6 +89,63 @@ What it does ------------- +The tool performs cluster analysis of any dataset, according to most used algorithms: K-means, agglomerative +clustering and DBSCAN (Density Based Spatial Clustering of Applications with Noise). + +Accepted files are: + - Tabular files in which rows indicate different variables and columns different observations. The first row reports the observations’ labels. + + +Example of input dataset: +------------------------- + ++----------+----------+----------+ +|TCGAA62670|TCGAA62671|TCGAA62672| ++==========+==========+==========+ +| 0.523167 | 0.371355 | 0.925661 | ++----------+----------+----------+ +| 0.568765 | 0.765567 | 0.456789 | ++----------+----------+----------+ +| 0.876545 | 0.768933 | 0.987654 | ++----------+----------+----------+ +| 0.456788 | 0.876543 | 0.876542 | ++----------+----------+----------+ +| 0.876543 | 0.786543 | 0.897654 | ++----------+----------+----------+ + +. + + +Options: +-------- + +The following clustering types can be chosen: + - K-means. This option requires the number of clusters (k) to be set. Different values of k can be tested. + - Agglomerative clustering. Different values of k can be set, to cut the resulting dendrogram. + - DBSCAN. The DBSCAN method chooses the number of clusters based on parameters that define when a region is to be considered dense. Custom parameters may be used, namely the maximum distance between two samples for one to be considered as in the neighborhood of the other and the number of samples in a neighborhood for a point to be considered as a core point. + +The tool generates: + - a tab-separated file: reporting the affiliation of each observation to a cluster. In case different numbers of clusters have been tested, the best cluster assignment is reported according to maximum average silhouette score. If desired, the elbow plot is generated, as well as silhouette plot for each k. + - a list of items, including: 1) the cluster assignment for each tested number of clusters 2) the dendrogram in case of agglomerative clustering 3) elbow and silhouete plots in case of k-means clustering. + - a log file (.txt). + + +.. class:: infomark + +**TIP**: This tool has been conceived to cluster gene expression data, by using the RAS scores computed by the tool +`MaREA`_ as feature + +.. class:: infomark + +**TIP**: If your data is not TAB delimited, use `Convert delimiters to TAB`_. + + + +@REFERENCE@ + +.. _MaREA: https://www.biorxiv.org/content/early/2018/01/16/248724 +.. _Convert delimiters to TAB: https://usegalaxy.org/?tool_id=Convert+characters1&version=1.0.0&__identifer=6t22teyofhj + ]]> </help> |