Mercurial > repos > eschen42 > w4mkmeans

--- a/w4mkmeans.xml	Tue Aug 08 15:30:38 2017 -0400
+++ b/w4mkmeans.xml	Wed Aug 09 18:06:55 2017 -0400
@@ -1,5 +1,5 @@
-<tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1">
-  <description>Calculate K-means for dataMatrix features or samples</description>
+<tool id="w4mkmeans" name="w4mKmeans" version="0.98.3">
+  <description>Calculate K-means for W4M dataMatrix features or samples</description>

   <requirements>
     <requirement type="package" version="3.3.2">r-base</requirement>
@@ -14,18 +14,19 @@
   <command detect_errors="aggressive"><![CDATA[
     Rscript $__tool_directory__/w4mkmeans_wrapper.R
       tool_directory $__tool_directory__
+      algorithm '$algorithm'
+      categorical_prefix '$categoricalPrefix'
       data_matrix_path '$dataMatrix_in'
-      variable_metadata_path '$variableMetadata_in'
-      sample_metadata_path '$sampleMetadata_in'
-      ksamples '$ksamples'
-      kfeatures '$kfeatures'
       iter_max '$iter_max'
+      kfeatures '$kfeatures'
+      ksamples '$ksamples'
       nstart '$nstart'
-      algorithm '$algorithm'
+      sampleMetadata_out '$sampleMetadata_out'
+      sample_metadata_path '$sampleMetadata_in'
       scores_out '$scores_out'
-      sampleMetadata_out '$sampleMetadata_out'
+      slots "\${GALAXY_SLOTS:-1}"
       variableMetadata_out '$variableMetadata_out'
-      slots "\${GALAXY_SLOTS:-1}"
+      variable_metadata_path '$variableMetadata_in'
     ; echo exit code $?
   ]]></command>

@@ -33,11 +34,12 @@
     <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
     <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
     <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
+    <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="[categorical_prefix] Some tools require non-numeric values to discern categorical data; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." />
     <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
     <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />
     <param name="iter_max" label="Max number of iterations" type="text" value = "10" help="[iter_max] The maximum number of iterations allowed; default 10." />
     <param name="nstart" label="Number of random sets" type="text" value = "1" help="[nstart] How many random sets should be chosen; default 1." />
-    <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info.">
+    <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info.">
       <option value="Forgy">Forgy</option>
       <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
       <option value="Lloyd">Lloyd</option>
@@ -52,7 +54,7 @@
   </outputs>

   <tests>
-        <test>
+    <test>
       <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
       <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
       <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
@@ -176,6 +178,10 @@

   - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).

+**categorical_prefix** - character(s) to add as prefix to category number (default = 'k')
+
+  - some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data).
+
 ------------
 Output files
 ------------
@@ -232,7 +238,8 @@
 NEWS
 ----

-August 2017, Version 0.98.1 - First release
+- August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
+- August 2017, Version 0.98.1 - First release

 ---------
 Citations
--- a/w4mkmeans_routines.R	Tue Aug 08 15:30:38 2017 -0400
+++ b/w4mkmeans_routines.R	Wed Aug 09 18:06:55 2017 -0400
@@ -10,23 +10,24 @@
      "w4mkmeans: bad input.",
      "# contract:",
      "    required - caller will provide an environment comprising:",
-     "      log_print        - a logging function with the signature function(x, ...) expecting strings as x and ...",
-     "      variableMetadata - the corresponding W4M data.frame having feature metadata",
-     "      sampleMetdata    - the corresponding W4M data.frame having sample metadata",
-     "      dataMatrix       - the corresponding W4M matrix",
-     "      slots            - the number of parallel slots for calculating kmeans",
+     "      log_print          - a logging function with the signature function(x, ...) expecting strings as x and ...",
+     "      variableMetadata   - the corresponding W4M data.frame having feature metadata",
+     "      sampleMetadata     - the corresponding W4M data.frame having sample metadata",
+     "      dataMatrix         - the corresponding W4M matrix",
+     "      slots              - the number of parallel slots for calculating kmeans",
      "    optional - environment may comprise:",
-     "      kfeatures        - an array of integers, the k's to apply for clustering by feature (default, empty array)",
-     "      ksamples         - an array of integers, the k's to apply for clustering by sample (default, empty array)",
-     "      iter.max         - the maximum number of iterations when calculating a cluster (default = 10)",
-     "      nstart           - how many random sets of centers should be chosen (default = 1)",
-     "      algorithm        - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
+     "      kfeatures          - an array of integers, the k's to apply for clustering by feature (default, empty array)",
+     "      ksamples           - an array of integers, the k's to apply for clustering by sample (default, empty array)",
+     "      iter.max           - the maximum number of iterations when calculating a cluster (default = 10)",
+     "      nstart             - how many random sets of centers should be chosen (default = 1)",
+     "      algorithm          - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
+     "      categorical_prefix - string to prepend to cluster numbers in the output (default = 'k')",
      "      ",
      "    this routine will return a list comprising:",
-     "      variableMetadata - the input variableMetadata data.frame with updates, if any",
-     "      sampleMetadata   - the input sampleMetadata data.frame with updates, if any",
-     "      scores           - an array of strings, each representing a line of a tsv having the following header:",
-     "                           clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
+     "      variableMetadata   - the input variableMetadata data.frame with updates, if any",
+     "      sampleMetadata     - the input sampleMetadata data.frame with updates, if any",
+     "      scores             - an array of strings, each representing a line of a tsv having the following header:",
+     "                             clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
     )
   )
 }
@@ -37,11 +38,12 @@
     lapply(w4kmeans_usage(),print)
   }
   # supply default arguments
-  if ( ! exists("iter.max" , env) ) env$iter.max  <- 10
-  if ( ! exists("nstart"   , env) ) env$nstart    <- 1
-  if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong'
-  if ( ! exists("ksamples" , env) ) env$ksamples  <- c()
-  if ( ! exists("kfeatures", env) ) env$kfeatures <- c()
+  if ( ! exists("iter.max"          , env) ) env$iter.max  <- 10
+  if ( ! exists("nstart"            , env) ) env$nstart    <- 1
+  if ( ! exists("algorithm"         , env) ) env$algorithm <- 'Hartigan-Wong'
+  if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k'
+  if ( ! exists("ksamples"          , env) ) env$ksamples  <- c()
+  if ( ! exists("kfeatures"         , env) ) env$kfeatures <- c()
   # check mandatory arguments
   expected <- c(
     "log_print"
@@ -61,9 +63,19 @@
   scores          <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
   sampleMetadata  <- env$sampleMetadata
   featureMetadata <- env$variableMetadata
-  ksamples        <- as.numeric(env$ksamples)
-  kfeatures       <- as.numeric(env$kfeatures)
   slots           <- env$slots
+  positive_ints <- function(a, what) {  # reduce 'a' to sorted, unique, positive integers; 'what' names the argument in the message
+    i <- as.integer(a)    # may introduce NAs by coercion
+    i <- i[!is.na(i)]     # eliminate NAs
+    i <- i[i > 0]         # eliminate non-positive integers
+    i <- unique(sort(i))  # eliminate redundancy and disorder
+    if (length(a)!=length(i)) {  # any element dropped above (including a 0 entry) changes the length
+      failure_action("Some values for '", what, "' were skipped where not unique, not positive, or not convertible to an integer.")  # NOTE(review): failure_action is defined outside this hunk - confirm whether it aborts or only reports
+    }
+    return (i)            # return results, if any
+  }
+  ksamples        <- positive_ints(env$ksamples , "ksamples")
+  kfeatures       <- positive_ints(env$kfeatures, "kfeatures")

   myLapply <- parLapply
   # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation)
@@ -113,7 +125,7 @@
         for ( i in 1:ksamples_length ) {
           result <- smpl_result_list[[i]]
           if (result$success) {
-            sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters
+            sampleMetadata[sprintf("k%d",ksamples[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
             scores <- c(scores, result$value$scores)
           }
         }
@@ -132,7 +144,7 @@
         for ( i in 1:kfeatures_length ) {
           result <- feat_result_list[[i]]
           if (result$success) {
-            featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters
+            featureMetadata[sprintf("k%d",kfeatures[i])] <- sprintf("%s%d", env$categorical_prefix, result$value$clusters)
             scores <- c(scores, result$value$scores)
           }
         }
--- a/w4mkmeans_wrapper.R	Tue Aug 08 15:30:38 2017 -0400
+++ b/w4mkmeans_wrapper.R	Wed Aug 09 18:06:55 2017 -0400
@@ -8,25 +8,26 @@
 #   - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html)

 # invocation:
-#   Rscript $__tool_directory__/w4mkmeans_wrapper.R \
-#     tool_directory $__tool_directory__
-#     data_matrix_path '$dataMatrix_in' \
-#     variable_metadata_path '$variableMetadata_in' \
-#     sample_metadata_path '$sampleMetadata_in' \
-#     kfeatures '$kfeatures' \
-#     ksamples '$ksamples' \
-#     iter_max '$iter_max' \
-#     nstart '$nstart' \
-#     algorithm '$algorithm' \
-#     scores '$scores' \
-#     sampleMetadata_out '$sampleMetadata_out' \
-#     variableMetadata_out '$variableMetadata_out' \
-#     slots "\${GALAXY_SLOTS:-1}" \
+#   Rscript w4mkmeans_wrapper.R \
+#     algorithm "$algorithm" \
+#     categorical_prefix "$categoricalPrefix" \
+#     data_matrix_path "$dataMatrix_in" \
+#     iter_max "$iter_max" \
+#     kfeatures "$kfeatures" \
+#     ksamples "$ksamples" \
+#     nstart "$nstart" \
+#     sampleMetadata_out "$sampleMetadata_out" \
+#     sample_metadata_path "$sampleMetadata_in" \
+#     scores_out "$scores_out" \
+#     slots "${GALAXY_SLOTS:-1}" \
+#     variableMetadata_out "$variableMetadata_out" \
+#     variable_metadata_path "$variableMetadata_in"
 #
 # <inputs>
 #   <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
 #   <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
 #   <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
+#   <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="Some tools require non-numeric values to discern categorical; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." />
 #   <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." />
 #   <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." />
 #   <param name="iter_max" label="Max number of iterations" type="text" value="10" help="The maximum number of iterations allowed; default 10." />
@@ -294,6 +295,10 @@
 args_env$slots     <- as.numeric(               argVc['slots'    ])
 # string args
 args_env$algorithm <- as.character(             argVc['algorithm'])
+args_env$categorical_prefix <- as.character(    argVc['categorical_prefix'])
+
+
+# make local 'log_print' function available through 'env'
 args_env$log_print <- log_print

 log_print("PARAMETERS (parsed):")