comparison marea_cluster.xml @ 81:70413509ac7e draft

Uploaded
author bimib
date Mon, 07 Jun 2021 14:10:03 +0000
parents
children
comparison
equal deleted inserted replaced
80:788730c95809 81:70413509ac7e
1 <tool id="MaREA_cluester" name="Cluster Analysis" version="1.1.2">
2 <description/> <!-- empty: no short description text is supplied for this tool -->
3 <macros> <!-- imports shared macro definitions; presumably the source of the "citations" macro and the @REFERENCE@ token used further down (verify in marea_macros.xml) -->
4 <import>marea_macros.xml</import>
5 </macros>
6 <requirements> <!-- exact package versions the wrapped Python script is declared to need -->
7 <requirement version="0.25.3" type="package">pandas</requirement>
8 <requirement version="1.6.3" type="package">scipy</requirement>
9 <requirement version="0.24.2" type="package">scikit-learn</requirement>
10 <requirement version="3.4.2" type="package">matplotlib</requirement>
11 <requirement version="1.20.3" type="package">numpy</requirement>
12 </requirements>
13 <!-- Builds the marea_cluster.py command line. Path-valued substitutions are single-quoted so that a path containing spaces or shell metacharacters stays a single argument. Only the flags of the selected clustering type are emitted; the DBSCAN tuning flags are emitted only when the "advanced" switch equals 'true'. --> <command detect_errors="exit_code">
14 <![CDATA[
15 python '$__tool_directory__/marea_cluster.py'
16 --input '$input'
17 --tool_dir '$__tool_directory__'
18 --out_log '$log'
19 --best_cluster '$best_cluster'
20 --cluster_type ${data.clust_type}
21 #if $data.clust_type == 'kmeans':
22 --k_min ${data.k_min}
23 --k_max ${data.k_max}
24 --elbow ${data.elbow}
25 --silhouette ${data.silhouette}
26 #end if
27 #if $data.clust_type == 'dbscan':
28 #if $data.dbscan_advanced.advanced == 'true':
29 --eps ${data.dbscan_advanced.eps}
30 --min_samples ${data.dbscan_advanced.min_samples}
31 #end if
32 #end if
33 #if $data.clust_type == 'hierarchy':
34 --k_min ${data.k_min}
35 --k_max ${data.k_max}
36 --silhouette ${data.silhouette}
37 #end if
38 ]]>
39 </command>
40 <inputs> <!-- One input dataset plus a conditional whose visible parameters depend on the chosen clustering type. -->
41 <param name="input" argument="--input" type="data" format="tabular, csv, tsv" label="Input dataset" />
42
43 <conditional name="data">
44 <param name="clust_type" argument="--cluster_type" type="select" label="Choose clustering type:">
45 <option value="kmeans" selected="true">KMeans</option>
46 <option value="dbscan">DBSCAN</option>
47 <option value="hierarchy">Agglomerative Hierarchical</option>
48 </param>
49 <when value="kmeans">
50 <param name="k_min" argument="--k_min" type="integer" min="2" max="20" value="2" label="Min number of clusters (k) to be tested" />
51 <param name="k_max" argument="--k_max" type="integer" min="2" max="20" value="3" label="Max number of clusters (k) to be tested" />
52 <param name="elbow" argument="--elbow" type="boolean" checked="true" label="Draw the elbow plot from k-min to k-max"/>
53 <param name="silhouette" argument="--silhouette" type="boolean" checked="true" label="Draw the Silhouette plot from k-min to k-max"/>
54 </when>
55 <when value="dbscan">
56 <conditional name="dbscan_advanced">
57 <!-- "advanced" is the test parameter of this conditional. It is a boolean, so it takes no option children (those belong to select parameters) and its default is set with "checked". -->
58 <!-- Unchecked (the default): the command template adds no DBSCAN tuning flags. -->
59 <!-- Checked: the eps and min_samples values below are passed to the script. -->
60 <param name="advanced" type="boolean" checked="false" label="Want to use custom params for DBSCAN? (if not optimal values will be used)" />
61 <when value="false"></when>
62 <when value="true">
63 <param name="eps" argument="--eps" type="float" value="0.5" label="Epsilon - The maximum distance between two samples for one to be considered as in the neighborhood of the other" />
64 <param name="min_samples" argument="--min_samples" type="integer" value="5" label="Min samples - The number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself)"/>
65
66 </when>
67 </conditional>
68 </when>
69 <when value="hierarchy">
70 <param name="k_min" argument="--k_min" type="integer" min="2" max="20" value="2" label="Min number of clusters (k) to be tested" />
71 <param name="k_max" argument="--k_max" type="integer" min="3" max="20" value="3" label="Max number of clusters (k) to be tested" /> <!-- NOTE(review): lower bound is 3 here but 2 in the kmeans branch; confirm which is intended -->
72 <param name="silhouette" argument="--silhouette" type="boolean" checked="true" label="Draw the Silhouette plot from k-min to k-max"/>
73 </when>
74 </conditional>
75 </inputs>
76
77 <outputs> <!-- a log file, the best cluster assignment table, and a collection of files discovered after the run -->
78 <data name="log" format="txt" label="${tool.name} - Log" />
79 <data name="best_cluster" format="tabular" label="${tool.name} - best cluster assignment" />
80 <collection name="results" label="${tool.name} - Plots and results" type="list"> <!-- filled from the files found in the "clustering" directory -->
81 <discover_datasets directory="clustering" pattern="__name_and_ext__"/>
82 <filter>data['clust_type'] == "kmeans" or data['clust_type'] == "hierarchy"</filter> <!-- the collection is only produced for kmeans and hierarchy runs, not for dbscan -->
83 </collection>
84 </outputs>
85 <help>
86 <![CDATA[
87
88 What it does
89 -------------
90
91 The tool performs cluster analysis of any dataset, using the most widely used algorithms: K-means, agglomerative
92 clustering and DBSCAN (Density Based Spatial Clustering of Applications with Noise).
93
94 Accepted files are:
95 - Tabular files in which rows indicate different variables and columns different observations. The first row reports the observations’ labels.
96
97
98 Example of input dataset:
99 -------------------------
100
101 +----------+----------+----------+
102 |TCGAA62670|TCGAA62671|TCGAA62672|
103 +==========+==========+==========+
104 | 0.523167 | 0.371355 | 0.925661 |
105 +----------+----------+----------+
106 | 0.568765 | 0.765567 | 0.456789 |
107 +----------+----------+----------+
108 | 0.876545 | 0.768933 | 0.987654 |
109 +----------+----------+----------+
110 | 0.456788 | 0.876543 | 0.876542 |
111 +----------+----------+----------+
112 | 0.876543 | 0.786543 | 0.897654 |
113 +----------+----------+----------+
114
115 .
116
117
118 Options:
119 --------
120
121 The following clustering types can be chosen:
122 - K-means. This option requires the number of clusters (k) to be set. Different values of k can be tested.
123 - Agglomerative clustering. Different values of k can be set, to cut the resulting dendrogram.
124 - DBSCAN. The DBSCAN method chooses the number of clusters based on parameters that define when a region is to be considered dense. Custom parameters may be used, namely the maximum distance between two samples for one to be considered as in the neighborhood of the other and the number of samples in a neighborhood for a point to be considered as a core point.
125
126 The tool generates:
127 - a tab-separated file reporting the cluster to which each observation is assigned. If different numbers of clusters have been tested, the best cluster assignment is reported according to the maximum average silhouette score. If desired, the elbow plot is generated, as well as a silhouette plot for each k.
128 - a list of items, including: 1) the cluster assignment for each tested number of clusters; 2) the dendrogram in case of agglomerative clustering; 3) elbow and silhouette plots in case of k-means clustering.
129 - a log file (.txt).
130
131
132 .. class:: infomark
133
134 **TIP**: This tool has been conceived to cluster gene expression data, by using the RAS scores computed by `Ras tool`_.
135
136 .. class:: infomark
137
138 **TIP**: If your data is not TAB delimited, use `Convert delimiters to TAB`_.
139
140 @REFERENCE@
141
142 .. _Ras tool: http://bimib.disco.unimib.it:5555/?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fbimib%2Fmarea%2FMaREA+RAS+Generator%2F1.0.6&version=1.0.6&__identifer=auulv6gbp76
143 .. _Convert delimiters to TAB: http://bimib.disco.unimib.it:5555/?tool_id=Convert+characters1&version=1.0.0&__identifer=76g7trea4j6
144
145 ]]>
146 </help>
147 <expand macro="citations" /> <!-- expands the macro named "citations"; presumably defined in the imported marea_macros.xml (verify) -->
148 </tool>
149
150