Mercurial > repos > eschen42 > w4mkmeans

diff w4mkmeans.xml @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author: eschen42
date: Tue, 08 Aug 2017 15:30:38 -0400
children: 02cafb660b72
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/w4mkmeans.xml	Tue Aug 08 15:30:38 2017 -0400
@@ -0,0 +1,319 @@
+<tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1">
+  <description>Calculate K-means for dataMatrix features or samples</description>
+
+  <!-- Software dependencies Galaxy must resolve for this tool: pinned versions of r-base and r-batch. -->
+  <requirements>
+    <requirement type="package" version="3.3.2">r-base</requirement>
+    <requirement type="package" version="1.1_4">r-batch</requirement>
+  </requirements>
+
+  <!--
+    Treat any exit status of 1 or greater as a fatal job error.
+    NOTE(review): the command tag below also sets detect_errors; confirm which
+    of the two mechanisms takes precedence in the targeted Galaxy release.
+  -->
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+
+  <!--
+    Invoke the R wrapper, passing every option as a "name value" argument pair.
+    All paths and values are single-quoted so that whitespace cannot split an
+    argument; this now includes the tool directory, which was previously bare.
+    The trailing shell snippet still prints "exit code N" as before, but then
+    restores N as the final status: the subshell "(exit N)" sets the status
+    without terminating the enclosing script. With a bare echo as the last
+    step the final status was always 0, which hid a failed Rscript from the
+    exit_code rule declared in the stdio section.
+  -->
+  <command detect_errors="aggressive"><![CDATA[
+    Rscript '$__tool_directory__/w4mkmeans_wrapper.R'
+      tool_directory '$__tool_directory__'
+      data_matrix_path '$dataMatrix_in'
+      variable_metadata_path '$variableMetadata_in'
+      sample_metadata_path '$sampleMetadata_in'
+      ksamples '$ksamples'
+      kfeatures '$kfeatures'
+      iter_max '$iter_max'
+      nstart '$nstart'
+      algorithm '$algorithm'
+      scores_out '$scores_out'
+      sampleMetadata_out '$sampleMetadata_out'
+      variableMetadata_out '$variableMetadata_out'
+      slots "\${GALAXY_SLOTS:-1}"
+    ; rc=$? ; echo exit code \$rc ; (exit \$rc)
+  ]]></command>
+
+  <!--
+    Tool form: the three tabular W4M input datasets, followed by the
+    clustering options that the command section passes to the wrapper script.
+  -->
+  <inputs>
+    <param name="dataMatrix_in" type="data" format="tabular"
+           label="Data matrix file"
+           help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab"/>
+    <param name="sampleMetadata_in" type="data" format="tabular"
+           label="Sample metadata file"
+           help="sample x metadata columns, separator: tab"/>
+    <param name="variableMetadata_in" type="data" format="tabular"
+           label="Variable metadata file"
+           help="variable x metadata columns, separator: tab"/>
+    <param name="ksamples" type="text" value="0"
+           label="K value(s) for samples"
+           help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none."/>
+    <param name="kfeatures" type="text" value="0"
+           label="K value(s) for features"
+           help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none."/>
+    <param name="iter_max" type="text" value="10"
+           label="Max number of iterations"
+           help="[iter_max] The maximum number of iterations allowed; default 10."/>
+    <param name="nstart" type="text" value="1"
+           label="Number of random sets"
+           help="[nstart] How many random sets should be chosen; default 1."/>
+    <param name="algorithm" type="select" value="Hartigan-Wong"
+           label="Algorithm for clustering"
+           help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info.">
+      <option value="Forgy">Forgy</option>
+      <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
+      <option value="Lloyd">Lloyd</option>
+      <option value="MacQueen">MacQueen</option>
+    </param>
+  </inputs>
+
+  <!-- Result datasets; each label is the tool name followed by the name of the corresponding input dataset. -->
+  <outputs>
+    <data name="sampleMetadata_out" format="tabular" label="${tool.name}_${sampleMetadata_in.name}"/>
+    <data name="variableMetadata_out" format="tabular" label="${tool.name}_${variableMetadata_in.name}"/>
+    <data name="scores_out" format="tabular" label="${tool.name}_${dataMatrix_in.name}.kmeans"/>
+  </outputs>
+
+  <!--
+    Functional test: cluster the example input tables (samples with K = 3,4 and
+    features with K = 5,6,7) and check the scores output for the column header
+    and the expected values.
+  -->
+  <tests>
+    <test>
+      <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
+      <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
+      <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
+      <param name="ksamples" value="3,4"/>
+      <param name="kfeatures" value="5,6,7"/>
+      <param name="iter_max" value="10"/>
+      <param name="nstart" value="1"/>
+      <param name="algorithm" value="Hartigan-Wong"/>
+      <output name="scores_out">
+        <assert_contents>
+          <has_text text="proportion"/>
+          <has_text text="0.87482"/>
+          <has_text text="0.89248"/>
+          <has_text text="0.95355"/>
+          <has_text text="0.95673"/>
+          <has_text text="0.95963"/>
+        </assert_contents>
+      </output>
+    </test>
+  </tests>
+
+  <help>
+    <![CDATA[
+
+**Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)
+
+---------------------------------------------------------------------------
+
+
+**Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
+
+**R code used** - The R code invoked by this wrapper is the 'kmeans' function from the R 'stats' package
+
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+
+**Tool updates**
+
+See the **NEWS** section at the bottom of this page
+
+---------------------------------------------------
+
+===========================
+K-means for W4M data matrix
+===========================
+
+-----------
+Description
+-----------
+
+Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input.
+
+*Please note that XCMS refers to features as 'variables'.  This documentation does not use either term consistently.*
+
+
+-----------------
+Workflow Position
+-----------------
+
+  - Tool category: Statistical Analysis
+  - Upstream tool category: Preprocessing
+  - Downstream tool categories: Statistical Analysis
+
+
+----------
+Motivation
+----------
+
+This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively.
+
+  - If several, comma-separated K's are supplied, then one column is added for each K.
+  - This clustering is **not** hierarchical; no member of one cluster is a member of any other cluster.
+  - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
+  - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
+
+
+-----------
+Input files
+-----------
+
++--------------------------------------------+------------+
+| File                                       |   Format   |
++============================================+============+
+|     Data matrix                            |   tabular  |
++--------------------------------------------+------------+
+|     Sample metadata                        |   tabular  |
++--------------------------------------------+------------+
+|     Variable (i.e., feature) metadata      |   tabular  |
++--------------------------------------------+------------+
+
+
+----------
+Parameters
+----------
+
+**Data matrix** - input-file dataset
+
+  - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row names (variables) and column names (samples) must be identical to the rownames of the feature metadata and sample metadata, respectively (see below)
+
+**Sample metadata** - input-file dataset
+
+  - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
+
+**Feature metadata** - input-file dataset
+
+  - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
+
+**kfeatures** - K or K's for features (default = 0)
+
+  - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
+
+**ksamples** - K or K's for samples (default = 0)
+
+  - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
+
+**iter_max** - maximum_iterations (default = 10)
+
+  - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
+
+**nstart** - how many random sets should be chosen (default = 1)
+
+  - number of random initial sets of cluster centers to try per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
+
+------------
+Output files
+------------
+
+**XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
+
+  - **k#** - cluster number for clustering samples with K = #
+
+**XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
+
+  - **k#** - cluster number for clustering features with K = #
+
+**scores** - (tabular separated values) file with one line for each K.
+
+  - **clusterOn** - what was clustered - either 'sample' or 'feature'
+  - **k** - the chosen K for clustering
+  - **totalSS** - total (*between-clusters* plus total of *within-clusters*) sum of squares
+  - **betweenSS** - *between-clusters* sum of squares
+  - **proportion** - betweenSS / totalSS
+
+---------------
+Working example
+---------------
+
+**Input files**
+
++-------------------+-------------------------------------------------------------------------------------------------------------------+
+| Input File        | Download from URL                                                                                                 |
++===================+===================================================================================================================+
+| Data matrix       | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv       |
++-------------------+-------------------------------------------------------------------------------------------------------------------+
+| Sample metadata   | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv   |
++-------------------+-------------------------------------------------------------------------------------------------------------------+
+| Feature metadata  | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
++-------------------+-------------------------------------------------------------------------------------------------------------------+
+
+**Other input parameters**
+
++-----------------+---------------+
+| Input Parameter | Value         |
++=================+===============+
+| ksamples        | 3,4           |
++-----------------+---------------+
+| kfeatures       | 5,6,7         |
++-----------------+---------------+
+| iter_max        | 10            |
++-----------------+---------------+
+| nstart          | 1             |
++-----------------+---------------+
+| algorithm       | Hartigan-Wong |
++-----------------+---------------+
+
+----
+NEWS
+----
+
+August 2017, Version 0.98.1 - First release
+
+---------
+Citations
+---------
+
+    ]]>
+  </help>
+  <!-- References: the R kmeans documentation, the W4M publications, and one reference per selectable clustering algorithm. -->
+  <citations>
+    <!-- R stats::kmeans documentation -->
+    <citation type="bibtex"><![CDATA[
+@incollection{RCoreTeam2017,
+  title = {stats::kmeans - K-Means Clustering},
+  booktitle = {R: A Language and Environment for Statistical Computing},
+  author = {{R Core Team}},
+  publisher = {R Foundation for Statistical Computing},
+  address = {Vienna, Austria},
+  year = {2017},
+  url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html},
+}
+    ]]></citation>
+    <!-- Forgy algorithm -->
+    <citation type="bibtex"><![CDATA[
+@article{forgy65,
+  added-at = {2006-03-23T12:22:43.000+0100},
+  author = {Forgy, E.},
+  biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho},
+  interhash = {c86383cba8cfe00d5e6ef200016aca3f},
+  intrahash = {1e31409932ce91df646c4731350e1207},
+  journal = {Biometrics},
+  keywords = {clustering kmeans},
+  number = 3,
+  pages = {768-769},
+  timestamp = {2006-03-23T12:22:43.000+0100},
+  title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
+  volume = 21,
+  year = 1965
+}
+    ]]></citation>
+    <!-- W4M 3.0 - Guitton et al. 2017-->
+    <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
+    <!-- W4M 2.5 - Giacomini et al. 2014 -->
+    <citation type="doi">10.1093/bioinformatics/btu813</citation>
+    <!-- Hartigan and Wong algorithm -->
+    <citation type="bibtex"><![CDATA[
+@article{Hartigan79,
+  added-at = {2007-02-27T16:22:09.000+0100},
+  author = {Hartigan, J. and Wong, M.},
+  biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81},
+  description = {WSD},
+  interhash = {10d6d33920d9af578a4d0a556dc1477d},
+  intrahash = {3d8bfc440c5725783876929c022f67ce},
+  journal = {Applied Statistics},
+  keywords = {imported},
+  pages = {100-108},
+  timestamp = {2007-02-27T16:22:11.000+0100},
+  title = {Algorithm AS136: A k-means clustering algorithm},
+  volume = 28,
+  year = 1979
+}
+    ]]></citation>
+    <!-- Lloyd algorithm -->
+    <citation type="doi">10.1109/TIT.1982.1056489</citation>
+    <!-- MacQueen algorithm -->
+    <citation type="bibtex"><![CDATA[
+@inproceedings{MacQueen1967,
+  added-at = {2011-01-11T13:35:01.000+0100},
+  author = {MacQueen, J. B.},
+  biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc},
+  booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability},
+  editor = {Cam, L. M. Le and Neyman, J.},
+  interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28},
+  intrahash = {5dcdb8cd9fba78e0e791af619d61d66d},
+  keywords = {kmeans clustering},
+  pages = {281-297},
+  publisher = {University of California Press},
+  timestamp = {2011-01-11T13:35:01.000+0100},
+  title = {Some Methods for Classification and Analysis of MultiVariate Observations},
+  volume = 1,
+  year = 1967
+}
+    ]]></citation>
+  </citations>
+  <!--
+     vim:et:sw=2:ts=2:
+--> </tool>
author	eschen42
date	Tue, 08 Aug 2017 15:30:38 -0400
parents
children	02cafb660b72