Mercurial > repos > bgruening > numeric_clustering
comparison numeric_clustering.xml @ 0:a3fd214e7555 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/numeric_clustering commit bafd56379ff227fb81f8cd61d708ebc39814da54
| author | bgruening |
|---|---|
| date | Fri, 01 Jan 2016 18:37:54 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a3fd214e7555 |
|---|---|
| 1 <tool id="numeric_clustering" name="Numeric Clustering" version="@VERSION@"> | |
| 2 <description></description> | |
| 3 <requirements> | |
| 4 <requirement type="package" version="2.3.0">anaconda</requirement> | |
| 5 </requirements> | |
| 6 <stdio> | |
| 7 <exit_code level="fatal" range="1:"/> | |
| 8 </stdio> | |
| 9 <macros> | |
| 10 <token name="@VERSION@">0.9</token> | |
| 11 <macro name="n_clusters" token_default_value="8"> | |
| 12 <param name="n_clusters" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of clusters" | |
| 13 help="default value is @DEFAULT_VALUE@ (--n_clusters)"/> | |
| 14 </macro> | |
| 15 <macro name="n_init"> | |
| 16 <param name="n_init" type="integer" optional="true" value="" label="Number of runs with different centroid seeds"/> | |
| 17 </macro> | |
| 18 <macro name="max_iter"> | |
| 19 <param name="max_iter" type="integer" optional="true" value="" label="Maximum number of iterations per single run"/> | |
| 20 </macro> | |
| 21 <macro name="random_state"> | |
| 22 <param name="random_state" type="integer" optional="true" value="" label="Initialize centers"/> | |
| 23 </macro> | |
| 24 <macro name="affinity"> | |
| 25 <param name="affinity" type="text" optional="true" value="" label="Affinity"/> | |
| 26 </macro> | |
| 27 <macro name="tol"> | |
| 28 <param name="tol" type="float" optional="true" value="" label="Relative tolerance"/> | |
| 29 </macro> | |
| 30 <macro name="init"> | |
| 31 <param name="init" type="select" label="Select initialization method"> | |
| 32 <option value="k-means++">k-means++</option> | |
| 33 <option value="random">random</option> | |
| 34 </param> | |
| 35 </macro> | |
| 36 </macros> | |
| 37 <version_command>echo "@VERSION@"</version_command> | |
| 38 <command><![CDATA[ | |
| 39 cat "$cluster_script" >&2 | |
| 40 && | |
| 41 #import json | |
| 42 #set $params = dict() | |
| 43 #for $key, $value in $algorithm_options.items(): | |
| 44 #if not $key.startswith('__') and $key.strip() != 'selected_algorithm' and str($value).strip(): | |
| 45 #if str($value).strip() == 'false': | |
| 46 #set $value = False | |
| 47 #elif str($value).strip() == 'true': | |
| 48 #set $value = True | |
| 49 #else: | |
| 50 #try: | |
| 51 #set $val = float($value) | |
| 52 #try: | |
| 53 #set $value = int($value) | |
| 54 #except: | |
| 55 #set $value = float($value) | |
| 56 #end try | |
| 57 #except: | |
| 58 #set $value = str($value) | |
| 59 #end try | |
| 60 #end if | |
| 61 $params.update({str($key): $value}) | |
| 62 #end if | |
| 63 #end for | |
| 64 #set $json_string = json.dumps( $params ) | |
| 65 | |
| 66 python "$cluster_script" '$json_string' | |
| 67 | |
| 68 ]]> | |
| 69 </command> | |
| 70 <configfiles> | |
| 71 <configfile name="cluster_script"> | |
| 72 <![CDATA[ | |
| 73 import sys | |
| 74 import json | |
| 75 import numpy as np | |
| 76 import sklearn.cluster | |
| 77 import pandas | |
| 78 | |
| 79 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | |
| 80 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm") | |
| 81 cluster_object = my_class() | |
| 82 | |
| 83 params = json.loads( sys.argv[1] ) | |
| 84 cluster_object.set_params(**params) | |
| 85 #if $end_column and $start_column: | |
| 86 | |
| 87 if $end_column >= $start_column: | |
| 88 data_matrix = data.values[:, $start_column-1:$end_column] | |
| 89 else: | |
| 90 data_matrix = data.values | |
| 91 | |
| 92 #else: | |
| 93 data_matrix = data.values | |
| 94 #end if | |
| 95 prediction = cluster_object.fit_predict( data_matrix ) | |
| 96 prediction_df = pandas.DataFrame(prediction) | |
| 97 res = pandas.concat([data, prediction_df], axis=1) | |
| 98 res.to_csv(path_or_buf = "$outfile", sep="\t", index=False) | |
| 99 ]]> | |
| 100 </configfile> | |
| 101 </configfiles> | |
| 102 <inputs> | |
| 103 <param name="infile" type="data" format="tabular" label="Data file with numeric values" /> | |
| 104 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Clustering column from" /> | |
| 105 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="to" /> | |
| 106 <conditional name="algorithm_options"> | |
| 107 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> | |
| 108 <option value="KMeans">KMeans</option> | |
| 109 <option value="DBSCAN">DBSCAN</option> | |
| 110 <option value="Birch">Birch</option> | |
| 111 <option value="MeanShift">MeanShift</option> | |
| 112 <option value="AffinityPropagation">Affinity Propagation</option> | |
| 113 <option value="AgglomerativeClustering">Agglomerative Clustering</option> | |
| 114 <option value="SpectralClustering">Spectral Clustering</option> | |
| 115 <option value="MiniBatchKMeans">Mini Batch KMeans</option> | |
| 116 </param> | |
| 117 <when value="KMeans"> | |
| 118 <expand macro="n_clusters" default_value="8"/> | |
| 119 <expand macro="init"/> | |
| 120 <expand macro="n_init"/> | |
| 121 <expand macro="max_iter"/> | |
| 122 <expand macro="tol"/> | |
| 123 <param name="precompute_distances" type="text" optional="true" value="" label="Precompute distances"/> | |
| 124 <expand macro="random_state"/> | |
| 125 <param name="copy_x" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Do not modify original data"/> | |
| 126 </when> | |
| 127 <when value="DBSCAN"> | |
| 128 <param name="eps" type="float" optional="true" value="0.5" label="Maximum neighborhood distance"/> | |
| 129 <param name="min_samples" type="integer" optional="true" value="5" label="Core point minimum population"/> | |
| 130 <param name="metric" type="text" optional="true" value="euclidean" label="Metric"/> | |
| 131 <param name="algorithm" type="select" optional="true" value="auto" label="Pointwise distance algorithm"> | |
| 132 <option value="auto">auto</option> | |
| 133 <option value="ball_tree">ball_tree</option> | |
| 134 <option value="kd_tree">kd_tree</option> | |
| 135 <option value="brute">brute</option> | |
| 136 </param> | |
| 137 <param name="leaf_size" type="integer" optional="true" value="30" label="Leaf size"/> | |
| 138 </when> | |
| 139 <when value="Birch"> | |
| 140 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/> | |
| 141 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/> | |
| 142 <expand macro="n_clusters" default_value="3" /> <!-- default to 3--> | |
| 143 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevale="false" label="Compute labels for each fit"/--> | |
| 144 </when> | |
| 145 <when value="AffinityPropagation"> | |
| 146 <param name="damping" type="float" optional="true" value="0.5" label="Damping factor"/> | |
| 147 <expand macro="max_iter"/> <!--default to 200 --> | |
| 148 <param name="convergence_iter" type="integer" optional="true" value="15" label="Number of iterations at each convergence step"/> | |
| 149 <param name="copy" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Make a copy of input data"/> | |
| 150 <!--param name="preference" type="text" optional="true" value="None" label="Array like shape (n_samples,)"/--> | |
| 151 <expand macro="affinity"/> <!--default = euclidean--> | |
| 152 </when> | |
| 153 <when value="MeanShift"> | |
| 154 <param name="bandwidth" type="float" optional="true" value="" label="RBF kernel bandwidth"/> | |
| 155 <!--param name="seeds" type="list" optional="true" value="None" label=""/--> | |
| 156 <param name="bin_seeding" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Discretize initial kernel locations"/> | |
| 157 <param name="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin"/> | |
| 158 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Cluster all"/> | |
| 159 </when> | |
| 160 <when value="AgglomerativeClustering"> | |
| 161 <expand macro="n_clusters" default_value="2" /> <!-- default 2--> | |
| 162 <expand macro="affinity"/> <!--default = euclidean--> | |
| 163 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/--> | |
| 164 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/--> | |
| 165 <param name="n_components" type="integer" optional="true" value="" label="Number of connected components"/> | |
| 166 <!--param name="compute_full_tree" type="text or boolean" optional="true" value="auto" label=""/--> | |
| 167 <param name="linkage" type="select" optional="true" value="ward" label="Linkage"> | |
| 168 <option value="ward">ward</option> | |
| 169 <option value="complete">complete</option> | |
| 170 <option value="average">average</option> | |
| 171 </param> | |
| 172 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/--> | |
| 173 </when> | |
| 174 <when value="SpectralClustering"> | |
| 175 <expand macro="n_clusters" default_value="8" /> | |
| 176 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy"> | |
| 177 <option value="arpack">arpack</option> | |
| 178 <option value="lobpcg">lobpcg</option> | |
| 179 <option value="amg">amg</option> | |
| 180 </param> | |
| 181 <expand macro="random_state"/> | |
| 182 <!-- Todo: extend random_state type to int seed, RandomState instance, or None. --> | |
| 183 <expand macro="n_init"/> <!-- default to 10--> | |
| 184 <param name="gamma" type="float" optional="true" value="1.0" label="Kernel scaling factor"/> | |
| 185 <expand macro="affinity"/> <!--default =rbf--> | |
| 186 <param name="n_neighbors" type="integer" optional="true" value="10" label="Number of neighbors"/> | |
| 187 <!--param name="eigen_tol" type="float" optional="true" value="0.0" label="arpack eigendecomposition stopping threshold"/--> | |
| 188 <param name="assign_labels" type="select" optional="true" value="kmeans" label="Assign labels"> | |
| 189 <option value="kmeans">kmeans</option> | |
| 190 <option value="discretize">discretize</option> | |
| 191 </param> | |
| 192 <param name="degree" type="integer" optional="true" value="3" label="Degree of the polynomial (polynomial kernel only)"/> | |
| 193 <param name="coef0" type="float" optional="true" value="1" label="Zero coefficient (polynomial and sigmoid kernels only)"/> | |
| 194 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/--> | |
| 195 </when> | |
| 196 <when value="MiniBatchKMeans"> | |
| 197 <expand macro="n_clusters" default_value="8"/> | |
| 198 <expand macro="init"/> | |
| 199 <expand macro="n_init"/> <!-- default to 3--> | |
| 200 <expand macro="max_iter"/> <!--default to 100--> | |
| 201 <expand macro="tol"/> <!--default = 0.0--> | |
| 202 <expand macro="random_state"/> | |
| 203 <param name="batch_size" type="integer" optional="true" value="100" label="Mini batch size"/> | |
| 204 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevale="false" label="Compute labels for all data"/--> | |
| 205 <param name="max_no_improvement" type="integer" optional="true" value="10" label="Maximum number of improvement attempts"/> | |
| 206 <param name="init_size" type="integer" optional="true" value="" label="Number of random init samples"/> | |
| 207 <param name="reassignment_ratio" type="float" optional="true" value="0.01" label="Re-assignment ratio"/> | |
| 208 </when> | |
| 209 </conditional> | |
| 210 </inputs> | |
| 211 <outputs> | |
| 212 <data format_source="infile" name="outfile"/> | |
| 213 </outputs> | |
| 214 <tests> | |
| 215 <test> | |
| 216 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 217 <param name="selected_algorithm" value="KMeans"/> | |
| 218 <param name="start_column" value="2" /> | |
| 219 <param name="end_column" value="4" /> | |
| 220 <param name="n_clusters" value="4" /> | |
| 221 <param name="init" value="k-means++" /> | |
| 222 <param name="random_state" value="100"/> | |
| 223 <output name="outfile" file="cluster_result01.txt"/> | |
| 224 </test> | |
| 225 <test> | |
| 226 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 227 <param name="selected_algorithm" value="KMeans"/> | |
| 228 <param name="start_column" value="2" /> | |
| 229 <param name="end_column" value="4" /> | |
| 230 <param name="n_clusters" value="4" /> | |
| 231 <param name="init" value="random" /> | |
| 232 <param name="random_state" value="100"/> | |
| 233 <output name="outfile" file="cluster_result02.txt"/> | |
| 234 </test> | |
| 235 <test> | |
| 236 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 237 <param name="selected_algorithm" value="DBSCAN"/> | |
| 238 <param name="start_column" value="2" /> | |
| 239 <param name="end_column" value="4" /> | |
| 240 <param name="algorithm" value="kd_tree"/> | |
| 241 <param name="leaf_size" value="10"/> | |
| 242 <param name="eps" value="1.0"/> | |
| 243 <output name="outfile" file="cluster_result03.txt"/> | |
| 244 </test> | |
| 245 <test> | |
| 246 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 247 <param name="selected_algorithm" value="Birch"/> | |
| 248 <param name="start_column" value="2" /> | |
| 249 <param name="end_column" value="4" /> | |
| 250 <param name="n_clusters" value="4"/> | |
| 251 <param name="threshold" value="0.008"/> | |
| 252 <output name="outfile" file="cluster_result04.txt"/> | |
| 253 </test> | |
| 254 <test> | |
| 255 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 256 <param name="selected_algorithm" value="Birch"/> | |
| 257 <param name="start_column" value="2" /> | |
| 258 <param name="end_column" value="4" /> | |
| 259 <param name="branching_factor" value="20"/> | |
| 260 <output name="outfile" file="cluster_result05.txt"/> | |
| 261 </test> | |
| 262 <test> | |
| 263 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 264 <param name="selected_algorithm" value="AffinityPropagation"/> | |
| 265 <param name="start_column" value="2" /> | |
| 266 <param name="end_column" value="4" /> | |
| 267 <param name="affinity" value="euclidean"/> | |
| 268 <param name="copy" value="false"/> | |
| 269 <output name="outfile" file="cluster_result06.txt"/> | |
| 270 </test> | |
| 271 <test> | |
| 272 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 273 <param name="selected_algorithm" value="AffinityPropagation"/> | |
| 274 <param name="start_column" value="2" /> | |
| 275 <param name="end_column" value="4" /> | |
| 276 <param name="damping" value="0.8"/> | |
| 277 <output name="outfile" file="cluster_result07.txt"/> | |
| 278 </test> | |
| 279 <test> | |
| 280 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 281 <param name="selected_algorithm" value="MeanShift"/> | |
| 282 <param name="start_column" value="2" /> | |
| 283 <param name="end_column" value="4" /> | |
| 284 <param name="min_bin_freq" value="3"/> | |
| 285 <output name="outfile" file="cluster_result08.txt"/> | |
| 286 </test> | |
| 287 <test> | |
| 288 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 289 <param name="selected_algorithm" value="MeanShift"/> | |
| 290 <param name="start_column" value="2" /> | |
| 291 <param name="end_column" value="4" /> | |
| 292 <param name="cluster_all" value="False"/> | |
| 293 <output name="outfile" file="cluster_result09.txt"/> | |
| 294 </test> | |
| 295 <test> | |
| 296 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 297 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
| 298 <param name="start_column" value="2" /> | |
| 299 <param name="end_column" value="4" /> | |
| 300 <param name="affinity" value="euclidean"/> | |
| 301 <param name="linkage" value="average"/> | |
| 302 <param name="n_clusters" value="4"/> | |
| 303 <output name="outfile" file="cluster_result10.txt"/> | |
| 304 </test> | |
| 305 <test> | |
| 306 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 307 <param name="selected_algorithm" value="AgglomerativeClustering"/> | |
| 308 <param name="start_column" value="2" /> | |
| 309 <param name="end_column" value="4" /> | |
| 310 <param name="linkage" value="complete"/> | |
| 311 <param name="n_clusters" value="4"/> | |
| 312 <output name="outfile" file="cluster_result11.txt"/> | |
| 313 </test> | |
| 314 <test> | |
| 315 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 316 <param name="selected_algorithm" value="SpectralClustering"/> | |
| 317 <param name="start_column" value="2" /> | |
| 318 <param name="end_column" value="4" /> | |
| 319 <param name="eigen_solver" value="arpack"/> | |
| 320 <param name="n_neighbors" value="12"/> | |
| 321 <param name="n_clusters" value="4"/> | |
| 322 <param name="assign_labels" value="discretize"/> | |
| 323 <param name="random_state" value="100"/> | |
| 324 <output name="outfile" file="cluster_result12.txt"/> | |
| 325 </test> | |
| 326 <test> | |
| 327 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 328 <param name="selected_algorithm" value="SpectralClustering"/> | |
| 329 <param name="start_column" value="2" /> | |
| 330 <param name="end_column" value="4" /> | |
| 331 <param name="assign_labels" value="discretize"/> | |
| 332 <param name="random_state" value="100"/> | |
| 333 <param name="degree" value="2"/> | |
| 334 <output name="outfile" file="cluster_result13.txt"/> | |
| 335 </test> | |
| 336 <test> | |
| 337 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 338 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
| 339 <param name="start_column" value="2" /> | |
| 340 <param name="end_column" value="4" /> | |
| 341 <param name="tol" value="0.5"/> | |
| 342 <param name="random_state" value="100"/> | |
| 343 <output name="outfile" file="cluster_result14.txt"/> | |
| 344 </test> | |
| 345 <test> | |
| 346 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 347 <param name="selected_algorithm" value="MiniBatchKMeans"/> | |
| 348 <param name="n_init" value="5"/> | |
| 349 <param name="start_column" value="2" /> | |
| 350 <param name="end_column" value="4" /> | |
| 351 <param name="batch_size" value="10"/> | |
| 352 <param name="n_clusters" value="4"/> | |
| 353 <param name="random_state" value="100"/> | |
| 354 <param name="reassignment_ratio" value="1.0"/> | |
| 355 <output name="outfile" file="cluster_result15.txt"/> | |
| 356 </test> | |
| 357 <test> | |
| 358 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | |
| 359 <param name="selected_algorithm" value="KMeans"/> | |
| 360 <param name="start_column" value="1" /> | |
| 361 <param name="end_column" value="1" /> | |
| 362 <param name="n_clusters" value="4" /> | |
| 363 <param name="random_state" value="100"/> | |
| 364 <output name="outfile" file="cluster_result16.txt"/> | |
| 365 </test> | |
| 366 </tests> | |
| 367 <help><![CDATA[ | |
| 368 **What it does** | |
| 369 | |
| 370 This clustering tool offers different clustering algorithms which are provided by | |
| 371 scikit-learn to find similarities among samples and cluster the samples based on these similarities. | |
| 372 | |
| 373 ]]></help> | |
| 374 <citations> | |
| 375 <citation type="bibtex"> | |
| 376 @article{scikit-learn, | |
| 377 title={Scikit-learn: Machine Learning in {P}ython}, | |
| 378 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. | |
| 379 and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. | |
| 380 and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and | |
| 381 Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, | |
| 382 journal={Journal of Machine Learning Research}, | |
| 383 volume={12}, | |
| 384 pages={2825--2830}, | |
| 385 year={2011}, | |
| 386 url = {https://github.com/scikit-learn/scikit-learn} | |
| 387 } | |
| 388 </citation> | |
| 389 </citations> | |
| 390 </tool> |
