diff w4mkmeans_wrapper.R @ 0:6ccbe18131a6 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author eschen42
date Tue, 08 Aug 2017 15:30:38 -0400
parents
children 02cafb660b72
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/w4mkmeans_wrapper.R	Tue Aug 08 15:30:38 2017 -0400
@@ -0,0 +1,370 @@
+#!/usr/bin/env Rscript
+
+# references:
+#   what this does:
+#   - [stats::kmeans](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html)
+#   - [stats::p.adjust](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html)
+#   how this does what it does:
+#   - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html)
+
+# invocation:
+#   Rscript $__tool_directory__/w4mkmeans_wrapper.R \
+#     tool_directory $__tool_directory__
+#     data_matrix_path '$dataMatrix_in' \
+#     variable_metadata_path '$variableMetadata_in' \
+#     sample_metadata_path '$sampleMetadata_in' \
+#     kfeatures '$kfeatures' \
+#     ksamples '$ksamples' \
+#     iter_max '$iter_max' \
+#     nstart '$nstart' \
+#     algorithm '$algorithm' \
+#     scores '$scores' \
+#     sampleMetadata_out '$sampleMetadata_out' \
+#     variableMetadata_out '$variableMetadata_out' \
+#     slots "\${GALAXY_SLOTS:-1}" \
+# 
+# <inputs>
+#   <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
+#   <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
+#   <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
+#   <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." />
+#   <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." />
+#   <param name="iter_max" label="Max number of iterations" type="text" value="10" help="The maximum number of iterations allowed; default 10." />
+#   <param name="nstart" label="Number of random sets" type="text" value="1" help="How many random sets should be chosen; default 1." />
+# 	<param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info and references.">
+# 	  <option value="Hartigan-Wong" selected="TRUE">Hartigan-Wong</option>
+# 	  <option value="Lloyd">Lloyd</option>
+# 	  <option value="MacQueen">MacQueen</option>
+# 	  <option value="Forgy">Forgy</option>
+# 	</param>
+# </inputs>
+# <outputs>
+#   <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data>
+#   <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data>
+# </outputs>
+
+##------------------------
+## libraries for this file
+##------------------------
+
+library(batch) ## for 'parseCommandArgs'
+
+##-------------------
+## Pre-initialization
+##-------------------
+
+argVc <- unlist(parseCommandArgs(evaluate=FALSE))
+if ( Reduce( `|`, grepl("tool_directory",names(argVc)) ) ) {
+  tool_directory <- as.character(argVc["tool_directory"])
+} else {
+  tool_directory <- "."
+}
+r_path <- function(f) paste( tool_directory, f, sep = "/" )
+
+##----------------------------------------------------------
+## Computation - source general and module-specific routines
+##----------------------------------------------------------
+
+log_print <- function(x, ...) { 
+  cat(
+    format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
+  , " "
+  , c(x, ...)
+  , "\n"
+  , sep=""
+  , file=stderr()
+  )
+}
+
+# log_print(sprintf("tool_directory is %s", tool_directory))
+
+w4m_general_purpose_routines_path <- r_path("w4m_general_purpose_routines.R")
+# log_print(sprintf("w4m_general_purpose_routines_path is %s", w4m_general_purpose_routines_path))
+if ( ! file.exists(w4m_general_purpose_routines_path) ) {
+  log_print("cannot find file w4m_general_purpose_routines.R")
+  q(save = "no", status = 1, runLast = TRUE)
+}
+# log_print("sourcing ",w4m_general_purpose_routines_path)
+source(w4m_general_purpose_routines_path)
+if ( ! exists("prepare.data.matrix") ) {
+  log_print("'prepare.data.matrix' was not read from file w4m_general_purpose_routines.R")
+  q(save = "no", status = 1, runLast = TRUE)
+}
+
+w4mkmeans_routines_path <- r_path("w4mkmeans_routines.R")
+# log_print(sprintf("w4mkmeans_routines_path is %s", w4mkmeans_routines_path))
+if ( ! file.exists(w4mkmeans_routines_path) ) {
+  log_print("cannot find file w4mkmeans_routines.R")
+  q(save = "no", status = 1, runLast = TRUE)
+}
+# log_print("sourcing ",w4mkmeans_routines_path)
+source(w4mkmeans_routines_path)
+if ( ! exists("w4mkmeans") ) {
+  log_print("'w4mkmeans' was not read from file w4mkmeans_routines.R")
+  q(save = "no", status = 1, runLast = TRUE)
+}
+
+##-----------------------------------------
+## Computation - W4m data-suppport routines
+##-----------------------------------------
+
+# read_data_frame - read a w4m data frame from a tsv, with error handling
+#   e.g., data_matrix_input_env <- read_data_frame(dataMatrix_in, "data matrix input")
+read_data_frame <- function(file_path, kind_string, failure_action = log_print) {
+  my.env <- new.env()
+  my.env$success <- FALSE
+  my.env$msg <- sprintf("no message reading %s", kind_string)
+  tryCatch(
+    expr = {
+      my.env$data    <- utils::read.delim( fill = FALSE, file = file_path )
+      my.env$success <- TRUE
+    }
+  , error = function(e) {
+     my.env$msg <<- sprintf("%s read failed", kind_string)
+    }
+  )
+  if (!my.env$success) {
+    failure_action(my.env$msg)
+  }
+  return (my.env)
+}
+
+# write_result - write a w4m data frame to a tsv
+write_result <- function(result, file_path, kind_string, failure_action = log_print) {
+  my.env <- new.env()
+  my.env$success <- FALSE
+  my.env$msg <- sprintf("no message writing %s", kind_string)
+  tryCatch(
+    expr = {
+      write.table(
+        x = result
+      , sep = "\t"
+      , file = file_path
+      , quote = FALSE
+      , row.names = FALSE
+      )
+      my.env$success <- TRUE
+    }
+  , error = function(e) {
+     my.env$msg <<- sprintf("%s write failed", kind_string)
+    }
+  )
+  if (!my.env$success) {
+    failure_action(my.env$msg)
+    return (my.env)
+  }
+  return (my.env)
+}
+
+# read the three input files
+read_input_data <- function(env, failure_action = log_print) {
+  kind_string <- "none"
+  tryCatch(
+    expr = {
+      # read in the sample metadata
+      kind_string <- "sample metadata input"
+      smpl_metadata_input_env <- 
+        read_data_frame(
+                         file_path = env$sample_metadata_path
+                       , kind_string = kind_string
+                       , failure_action = failure_action
+                       )
+      if (!smpl_metadata_input_env$success) {
+        failure_action(smpl_metadata_input_env$msg)
+        return ( FALSE )
+      }
+      env$sampleMetadata <- smpl_metadata_input_env$data
+
+      # read in the variable metadata
+      kind_string <- "variable metadata input"
+      vrbl_metadata_input_env <- 
+        read_data_frame(
+                         file_path = env$variable_metadata_path
+                       , kind_string = kind_string
+                       , failure_action = failure_action
+                       )
+      if (!vrbl_metadata_input_env$success) {
+        failure_action(vrbl_metadata_input_env$msg)
+        return ( FALSE )
+      }
+      env$variableMetadata <- vrbl_metadata_input_env$data
+
+      # read in the data matrix
+      kind_string <- "data matrix input"
+      data_matrix_input_env <-
+        read_data_frame(
+                         file_path = env$data_matrix_path
+                       , kind_string = kind_string
+                       , failure_action = failure_action
+                       )
+      if (!data_matrix_input_env$success) {
+        failure_action(data_matrix_input_env$msg)
+        return ( FALSE )
+      }
+      # data frame for dataMatrix has rownames in first column
+      data_matrix_df <- data_matrix_input_env$data
+      rownames(data_matrix_df) <- data_matrix_df[,1]
+      data_matrix <- data_matrix_df[,2:ncol(data_matrix_df)]
+      env$dataMatrix <- as.matrix(data_matrix)
+
+    }
+  , error = function(e) {
+     failure_action( sprintf("read_input_data failed for '%s' - %s", kind_string, format_error(e)) )
+     return ( FALSE )
+    }
+  )
+  return ( TRUE )
+}
+
+
+read_input_failure_action <- function(x, ...) { 
+  log_print("Failure reading input for '", modNamC, "' Galaxy module call")
+  log_print(x, ...)
+}
+
+##--------------------------
+## Computation - Entry Point
+##--------------------------
+
+##----------
+## Constants
+##----------
+
+modNamC <- "w4mkmeans" ## module name
+
+## options
+##--------
+
+# Set the handler for R error-handling
+options( show.error.messages = F
+       , error = function () { 
+                   log_print( "Fatal error in '", modNamC, "': ", geterrmessage() )
+                   q( "no", 1, F )
+                 }
+       , warn = -1
+       )
+
+# strings as factors? - not by default!
+# save old value
+strAsFacL <- options()$stringsAsFactors
+options(stringsAsFactors = FALSE)
+
+
+## log file
+##---------
+
+log_print("Start of the '", modNamC, "' Galaxy module call")
+
+## arguments
+##----------
+
+args_env <- new.env()
+
+# files
+
+log_print("PARAMETERS (raw):")
+invisible(
+  lapply(
+    X = 1:length(argVc)
+  , FUN = function(i) {
+      log_print(sprintf("  - %s: %s", names(argVc)[i], argVc[i]))
+    }
+  )
+)
+
+# write.table(as.matrix(argVc), col.names=F, quote=F, sep='\t')
+
+## output files
+sampleMetadata_out              <- as.character(argVc["sampleMetadata_out"])
+variableMetadata_out            <- as.character(argVc["variableMetadata_out"])
+scores_out                      <- as.character(argVc["scores_out"])
+## input files
+args_env$data_matrix_path       <- as.character(argVc["data_matrix_path"])
+args_env$variable_metadata_path <- as.character(argVc["variable_metadata_path"])
+args_env$sample_metadata_path   <- as.character(argVc["sample_metadata_path"])
+  
+# other parameters
+
+# multi-string args - split csv: "1,2,3" -> c("1","2","3")
+args_env$kfeatures <- strsplit(x = as.character(argVc['kfeatures']), split = ",", fixed = TRUE)[[1]]
+args_env$ksamples  <- strsplit(x = as.character(argVc['ksamples' ]), split = ",", fixed = TRUE)[[1]]
+# numeric args
+args_env$iter_max  <- as.numeric(               argVc['iter_max'  ])
+args_env$nstart    <- as.numeric(               argVc['nstart'   ])
+args_env$slots     <- as.numeric(               argVc['slots'    ])
+# string args
+args_env$algorithm <- as.character(             argVc['algorithm'])
+args_env$log_print <- log_print
+
+log_print("PARAMETERS (parsed):")
+for (member in ls(args_env)) {
+  value <- get(member, args_env)
+  value <- ifelse(length(value) == 1, value, sprintf("c(%s)", paste(value, collapse=", ")))
+  
+  log_print(sprintf("  - %s: %s", member, ifelse( !is.function(value) , value, "function" )))
+}
+log_print("")
+
+##---------------------------------------------------------
+## Computation - attempt to read input data
+##---------------------------------------------------------
+if ( ! read_input_data(args_env, failure_action = read_input_failure_action) ) {
+  result <- -1
+} else {
+  log_print("Input data was read successfully.")
+  result <- w4mkmeans(env = args_env)
+  log_print("returned from call to w4mkmeans.")
+}
+
+if ( length(result) == 0 ) {
+  log_print("no results were produced")
+  # exit with status code non-zero to indicate error
+  q(save = "no", status = 1, runLast = FALSE)
+} else if ( ! setequal(names(result),c("variableMetadata","sampleMetadata","scores")) ) {
+  log_print(sprintf("unexpected result keys %s", names(result)))
+  # exit with status code non-zero to indicate error
+  q(save = "no", status = 1, runLast = FALSE)
+} else if ( ! write_result(result = result$variableMetadata, file_path = variableMetadata_out, kind_string = "clustered variableMetadata")$success ) {
+  log_print("failed to write output file for clustered variableMetadata")
+  # exit with status code non-zero to indicate error
+  q(save = "no", status = 1, runLast = FALSE)
+} else if ( ! write_result(result = result$sampleMetadata, file_path = sampleMetadata_out, kind_string = "clustered sampleMetadata")$success ) {
+  log_print("failed to write output file for clustered sampleMetadata")
+  # exit with status code non-zero to indicate error
+  q(save = "no", status = 1, runLast = FALSE)
+} else {
+  tryCatch(
+    expr = {
+      fileConn<-file(scores_out)
+      writeLines(result$scores, fileConn)
+      close(fileConn)
+    }
+  , error = function(e) {
+      log_print(sprintf("failed to write output file for cluster scores - %s", format_error(e)))
+      # exit with status code non-zero to indicate error
+      q(save = "no", status = 1, runLast = FALSE)
+    }
+  )
+}
+
+##--------
+## Closing
+##--------
+
+
+if (!file.exists(sampleMetadata_out)) {
+  log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, sampleMetadata_out))
+}
+
+if (!file.exists(variableMetadata_out)) {
+  log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, variableMetadata_out))
+}
+
+if (!file.exists(scores_out)) {
+  log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, scores_out))
+}
+
+log_print("Normal termination of '", modNamC, "' Galaxy module call")
+
+# exit with status code zero
+q(save = "no", status = 0, runLast = FALSE)