Mercurial > repos > eschen42 > w4mkmeans
diff w4mkmeans.xml @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | 02cafb660b72 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/w4mkmeans.xml Tue Aug 08 15:30:38 2017 -0400 @@ -0,0 +1,319 @@ +<tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1"> + <description>Calculate K-means for dataMatrix features or samples</description> + + <requirements> + <requirement type="package" version="3.3.2">r-base</requirement> + <requirement type="package" version="1.1_4">r-batch</requirement> + </requirements> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <!-- Runs w4mkmeans_wrapper.R with name/value argument pairs. The tool directory is quoted so that paths containing spaces survive the shell. The last command line echoes the Rscript exit status and then re-raises it from a subshell; a bare echo would end the command with status 0 and hide failures from the exit_code rule above. --> + <command detect_errors="aggressive"><![CDATA[ + Rscript '$__tool_directory__/w4mkmeans_wrapper.R' + tool_directory '$__tool_directory__' + data_matrix_path '$dataMatrix_in' + variable_metadata_path '$variableMetadata_in' + sample_metadata_path '$sampleMetadata_in' + ksamples '$ksamples' + kfeatures '$kfeatures' + iter_max '$iter_max' + nstart '$nstart' + algorithm '$algorithm' + scores_out '$scores_out' + sampleMetadata_out '$sampleMetadata_out' + variableMetadata_out '$variableMetadata_out' + slots "\${GALAXY_SLOTS:-1}" + ; rc=$? ; echo exit code \$rc ; (exit \$rc) + ]]></command> + <!-- Inputs: three tabular datasets, the K lists (free text so that comma-separated values are accepted), the integer options iter_max and nstart, and the algorithm selector. Each value is passed to the wrapper under the argument of the same name. --> + <inputs> + <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> + <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> + <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> + <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." /> + <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." 
/> + <param name="iter_max" label="Max number of iterations" type="integer" value="10" help="[iter_max] The maximum number of iterations allowed; default 10." /> + <param name="nstart" label="Number of random sets" type="integer" value="1" help="[nstart] How many random sets should be chosen; default 1." /> + <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info."> + <option value="Forgy">Forgy</option> + <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option> + <option value="Lloyd">Lloyd</option> + <option value="MacQueen">MacQueen</option> + </param> + </inputs> + <!-- Outputs: each dataset is labelled with the tool name followed by the name of the matching input dataset. --> + <outputs> + <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data> + <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data> + <data name="scores_out" label="${tool.name}_${dataMatrix_in.name}.kmeans" format="tabular" ></data> + </outputs> + <!-- Single test: ksamples 3,4 and kfeatures 5,6,7 on the bundled test data; asserts that scores_out contains the listed text fragments. --> + <tests> + <test> + <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> + <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> + <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> + <param name="ksamples" value="3,4"/> + <param name="kfeatures" value="5,6,7"/> + <param name="iter_max" value="10"/> + <param name="nstart" value="1"/> + <param name="algorithm" value="Hartigan-Wong"/> + <output name="scores_out"> + <assert_contents> + <has_text text="proportion" /> + <has_text text="0.87482" /> + <has_text text="0.89248" /> + <has_text text="0.95355" /> + <has_text text="0.95673" /> + <has_text text="0.95963" /> + </assert_contents> + </output> + </test> + </tests> + + <help> + <![CDATA[ + +**Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) + 
+--------------------------------------------------------------------------- + + +**Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab GitHub repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper + +**R code used** - The R code invoked by this wrapper is the 'kmeans' function from the R 'stats' package + +---------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + +**Tool updates** + +See the **NEWS** section at the bottom of this page + +--------------------------------------------------- + +=========================== +K-means for W4M data matrix +=========================== + +----------- +Description +----------- + +Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input. + +*Please note that XCMS refers to features as 'variables'. This documentation does not use either term consistently.* + + +----------------- +Workflow Position +----------------- + + - Tool category: Statistical Analysis + - Upstream tool category: Preprocessing + - Downstream tool categories: Statistical Analysis + + +---------- +Motivation +---------- + +This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively. + + - If several comma-separated K's are supplied, then one column is added for each K. + - This clustering is **not** hierarchical; for a given K, each sample or feature belongs to exactly one cluster. + - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster. 
+ - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster. + + +----------- +Input files +----------- + ++--------------------------------------------+------------+ +| File | Format | ++============================================+============+ +| Data matrix | tabular | ++--------------------------------------------+------------+ +| Sample metadata | tabular | ++--------------------------------------------+------------+ +| Variable (i.e., feature) metadata | tabular | ++--------------------------------------------+------------+ + + +---------- +Parameters +---------- + +**Data matrix** - input-file dataset + + - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below) + +**Sample metadata** - input-file dataset + + - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values + +**Feature metadata** - input-file dataset + + - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values + +**kfeatures** - K or K's for features (default = 0) + + - integer or comma-separated integers; zero (the default) or less will result in no calculation. + +**ksamples** - K or K's for samples (default = 0) + + - integer or comma-separated integers; zero (the default) or less will result in no calculation. 
+ +**iter_max** - maximum iterations (default = 10) + + - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). + +**nstart** - how many random sets should be chosen (default = 1) + + - number of random sets of initial cluster centers to try per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). + +------------ +Output files +------------ + +**XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K + + - **k#** - cluster number for clustering samples with K = # + +**XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K + + - **k#** - cluster number for clustering features with K = # + +**scores** - (tabular separated values) file with one line for each K. + + - **clusterOn** - what was clustered - either 'sample' or 'feature' + - **k** - the chosen K for clustering + - **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares + - **betweenSS** - *between-treatments* sum of squares + - **proportion** - betweenSS / totalSS + +--------------- +Working example +--------------- + +**Input files** + ++-------------------+-------------------------------------------------------------------------------------------------------------------+ +| Input File | Download from URL | ++===================+===================================================================================================================+ +| Data matrix | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv | ++-------------------+-------------------------------------------------------------------------------------------------------------------+ +| Sample metadata | 
https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv | ++-------------------+-------------------------------------------------------------------------------------------------------------------+ +| Feature metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv | ++-------------------+-------------------------------------------------------------------------------------------------------------------+ + +**Other input parameters** + ++-----------------+---------------+ +| Input Parameter | Value | ++=================+===============+ +| ksamples | 3,4 | ++-----------------+---------------+ +| kfeatures | 5,6,7 | ++-----------------+---------------+ +| iter_max | 10 | ++-----------------+---------------+ +| nstart | 1 | ++-----------------+---------------+ +| algorithm | Hartigan-Wong | ++-----------------+---------------+ + +---- +NEWS +---- + +August 2017, Version 0.98.1 - First release + +--------- +Citations +--------- + + ]]> + </help> + <citations> + <citation type="bibtex"><![CDATA[ +@incollection{RCoreTeam2017, + title = {stats::kmeans - K-Means Clustering}, + booktitle = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + publisher = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2017}, + url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html}, +} + ]]></citation> + <!-- Forgy algorithm --> + <citation type="bibtex"><![CDATA[ +@article{forgy65, + added-at = {2006-03-23T12:22:43.000+0100}, + author = {Forgy, E.}, + biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho}, + interhash = {c86383cba8cfe00d5e6ef200016aca3f}, + intrahash = {1e31409932ce91df646c4731350e1207}, + journal = {Biometrics}, + keywords = {clustering kmeans}, + number = 3, + pages = {768-769}, + timestamp = {2006-03-23T12:22:43.000+0100}, + title 
= {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification}, + volume = 21, + year = 1965 +} + ]]></citation> + <!-- W4M 3.0 - Guitton et al. 2017 --> + <citation type="doi">10.1016/j.biocel.2017.07.002</citation> + <!-- W4M 2.5 - Giacomoni et al. 2014 --> + <citation type="doi">10.1093/bioinformatics/btu813</citation> + <!-- Hartigan and Wong algorithm --> + <citation type="bibtex"><![CDATA[ +@article{Hartigan79, + added-at = {2007-02-27T16:22:09.000+0100}, + author = {Hartigan, J. and Wong, M.}, + biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81}, + description = {WSD}, + interhash = {10d6d33920d9af578a4d0a556dc1477d}, + intrahash = {3d8bfc440c5725783876929c022f67ce}, + journal = {Applied Statistics}, + keywords = {imported}, + pages = {100-108}, + timestamp = {2007-02-27T16:22:11.000+0100}, + title = {Algorithm AS136: A k-means clustering algorithm}, + volume = 28, + year = 1979 +} + ]]></citation> + <!-- Lloyd algorithm --> + <citation type="doi">10.1109/TIT.1982.1056489</citation> + <!-- MacQueen algorithm --> + <citation type="bibtex"><![CDATA[ +@inproceedings{MacQueen1967, + added-at = {2011-01-11T13:35:01.000+0100}, + author = {MacQueen, J. B.}, + biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc}, + booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability}, + editor = {Cam, L. M. Le and Neyman, J.}, + interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28}, + intrahash = {5dcdb8cd9fba78e0e791af619d61d66d}, + keywords = {kmeans clustering}, + pages = {281-297}, + publisher = {University of California Press}, + timestamp = {2011-01-11T13:35:01.000+0100}, + title = {Some Methods for Classification and Analysis of MultiVariate Observations}, + volume = 1, + year = 1967 +} + ]]></citation> + </citations> + <!-- + vim:et:sw=2:ts=2: +--> </tool>