comparison w4mkmeans_wrapper.R @ 0:6ccbe18131a6 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author eschen42
date Tue, 08 Aug 2017 15:30:38 -0400
parents
children 02cafb660b72
comparison
equal deleted inserted replaced
-1:000000000000 0:6ccbe18131a6
1 #!/usr/bin/env Rscript
2
3 # references:
4 # what this does:
5 # - [stats::kmeans](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html)
6 # - [stats::p.adjust](https://stat.ethz.ch/R-manual/R-devel/library/stats/html/p.adjust.html)
7 # how this does what it does:
8 # - [parallel::clusterApply](https://stat.ethz.ch/R-manual/R-devel/library/parallel/html/clusterApply.html)
9
10 # invocation:
11 # Rscript $__tool_directory__/w4mkmeans_wrapper.R \
12 # tool_directory $__tool_directory__
13 # data_matrix_path '$dataMatrix_in' \
14 # variable_metadata_path '$variableMetadata_in' \
15 # sample_metadata_path '$sampleMetadata_in' \
16 # kfeatures '$kfeatures' \
17 # ksamples '$ksamples' \
18 # iter_max '$iter_max' \
19 # nstart '$nstart' \
20 # algorithm '$algorithm' \
21 # scores '$scores' \
22 # sampleMetadata_out '$sampleMetadata_out' \
23 # variableMetadata_out '$variableMetadata_out' \
24 # slots "\${GALAXY_SLOTS:-1}" \
25 #
26 # <inputs>
27 # <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
28 # <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
29 # <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
30 # <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="Single or min,max value(s) for K for features (variables), or 0 for none." />
31 # <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="Single or min,max value(s) for K for samples, or 0 for none." />
32 # <param name="iter_max" label="Max number of iterations" type="text" value="10" help="The maximum number of iterations allowed; default 10." />
33 # <param name="nstart" label="Number of random sets" type="text" value="1" help="How many random sets should be chosen; default 1." />
34 # <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info and references.">
35 # <option value="Hartigan-Wong" selected="TRUE">Hartigan-Wong</option>
36 # <option value="Lloyd">Lloyd</option>
37 # <option value="MacQueen">MacQueen</option>
38 # <option value="Forgy">Forgy</option>
39 # </param>
40 # </inputs>
41 # <outputs>
42 # <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data>
43 # <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data>
44 # </outputs>
45
46 ##------------------------
47 ## libraries for this file
48 ##------------------------
49
50 library(batch) ## for 'parseCommandArgs'
51
52 ##-------------------
53 ## Pre-initialization
54 ##-------------------
55
56 argVc <- unlist(parseCommandArgs(evaluate=FALSE))
57 if ( Reduce( `|`, grepl("tool_directory",names(argVc)) ) ) {
58 tool_directory <- as.character(argVc["tool_directory"])
59 } else {
60 tool_directory <- "."
61 }
62 r_path <- function(f) paste( tool_directory, f, sep = "/" )
63
64 ##----------------------------------------------------------
65 ## Computation - source general and module-specific routines
66 ##----------------------------------------------------------
67
68 log_print <- function(x, ...) {
69 cat(
70 format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
71 , " "
72 , c(x, ...)
73 , "\n"
74 , sep=""
75 , file=stderr()
76 )
77 }
78
79 # log_print(sprintf("tool_directory is %s", tool_directory))
80
81 w4m_general_purpose_routines_path <- r_path("w4m_general_purpose_routines.R")
82 # log_print(sprintf("w4m_general_purpose_routines_path is %s", w4m_general_purpose_routines_path))
83 if ( ! file.exists(w4m_general_purpose_routines_path) ) {
84 log_print("cannot find file w4m_general_purpose_routines.R")
85 q(save = "no", status = 1, runLast = TRUE)
86 }
87 # log_print("sourcing ",w4m_general_purpose_routines_path)
88 source(w4m_general_purpose_routines_path)
89 if ( ! exists("prepare.data.matrix") ) {
90 log_print("'prepare.data.matrix' was not read from file w4m_general_purpose_routines.R")
91 q(save = "no", status = 1, runLast = TRUE)
92 }
93
94 w4mkmeans_routines_path <- r_path("w4mkmeans_routines.R")
95 # log_print(sprintf("w4mkmeans_routines_path is %s", w4mkmeans_routines_path))
96 if ( ! file.exists(w4mkmeans_routines_path) ) {
97 log_print("cannot find file w4mkmeans_routines.R")
98 q(save = "no", status = 1, runLast = TRUE)
99 }
100 # log_print("sourcing ",w4mkmeans_routines_path)
101 source(w4mkmeans_routines_path)
102 if ( ! exists("w4mkmeans") ) {
103 log_print("'w4mkmeans' was not read from file w4mkmeans_routines.R")
104 q(save = "no", status = 1, runLast = TRUE)
105 }
106
107 ##-----------------------------------------
108 ## Computation - W4m data-suppport routines
109 ##-----------------------------------------
110
111 # read_data_frame - read a w4m data frame from a tsv, with error handling
112 # e.g., data_matrix_input_env <- read_data_frame(dataMatrix_in, "data matrix input")
113 read_data_frame <- function(file_path, kind_string, failure_action = log_print) {
114 my.env <- new.env()
115 my.env$success <- FALSE
116 my.env$msg <- sprintf("no message reading %s", kind_string)
117 tryCatch(
118 expr = {
119 my.env$data <- utils::read.delim( fill = FALSE, file = file_path )
120 my.env$success <- TRUE
121 }
122 , error = function(e) {
123 my.env$msg <<- sprintf("%s read failed", kind_string)
124 }
125 )
126 if (!my.env$success) {
127 failure_action(my.env$msg)
128 }
129 return (my.env)
130 }
131
132 # write_result - write a w4m data frame to a tsv
133 write_result <- function(result, file_path, kind_string, failure_action = log_print) {
134 my.env <- new.env()
135 my.env$success <- FALSE
136 my.env$msg <- sprintf("no message writing %s", kind_string)
137 tryCatch(
138 expr = {
139 write.table(
140 x = result
141 , sep = "\t"
142 , file = file_path
143 , quote = FALSE
144 , row.names = FALSE
145 )
146 my.env$success <- TRUE
147 }
148 , error = function(e) {
149 my.env$msg <<- sprintf("%s write failed", kind_string)
150 }
151 )
152 if (!my.env$success) {
153 failure_action(my.env$msg)
154 return (my.env)
155 }
156 return (my.env)
157 }
158
159 # read the three input files
160 read_input_data <- function(env, failure_action = log_print) {
161 kind_string <- "none"
162 tryCatch(
163 expr = {
164 # read in the sample metadata
165 kind_string <- "sample metadata input"
166 smpl_metadata_input_env <-
167 read_data_frame(
168 file_path = env$sample_metadata_path
169 , kind_string = kind_string
170 , failure_action = failure_action
171 )
172 if (!smpl_metadata_input_env$success) {
173 failure_action(smpl_metadata_input_env$msg)
174 return ( FALSE )
175 }
176 env$sampleMetadata <- smpl_metadata_input_env$data
177
178 # read in the variable metadata
179 kind_string <- "variable metadata input"
180 vrbl_metadata_input_env <-
181 read_data_frame(
182 file_path = env$variable_metadata_path
183 , kind_string = kind_string
184 , failure_action = failure_action
185 )
186 if (!vrbl_metadata_input_env$success) {
187 failure_action(vrbl_metadata_input_env$msg)
188 return ( FALSE )
189 }
190 env$variableMetadata <- vrbl_metadata_input_env$data
191
192 # read in the data matrix
193 kind_string <- "data matrix input"
194 data_matrix_input_env <-
195 read_data_frame(
196 file_path = env$data_matrix_path
197 , kind_string = kind_string
198 , failure_action = failure_action
199 )
200 if (!data_matrix_input_env$success) {
201 failure_action(data_matrix_input_env$msg)
202 return ( FALSE )
203 }
204 # data frame for dataMatrix has rownames in first column
205 data_matrix_df <- data_matrix_input_env$data
206 rownames(data_matrix_df) <- data_matrix_df[,1]
207 data_matrix <- data_matrix_df[,2:ncol(data_matrix_df)]
208 env$dataMatrix <- as.matrix(data_matrix)
209
210 }
211 , error = function(e) {
212 failure_action( sprintf("read_input_data failed for '%s' - %s", kind_string, format_error(e)) )
213 return ( FALSE )
214 }
215 )
216 return ( TRUE )
217 }
218
219
220 read_input_failure_action <- function(x, ...) {
221 log_print("Failure reading input for '", modNamC, "' Galaxy module call")
222 log_print(x, ...)
223 }
224
225 ##--------------------------
226 ## Computation - Entry Point
227 ##--------------------------
228
229 ##----------
230 ## Constants
231 ##----------
232
233 modNamC <- "w4mkmeans" ## module name
234
235 ## options
236 ##--------
237
238 # Set the handler for R error-handling
239 options( show.error.messages = F
240 , error = function () {
241 log_print( "Fatal error in '", modNamC, "': ", geterrmessage() )
242 q( "no", 1, F )
243 }
244 , warn = -1
245 )
246
247 # strings as factors? - not by default!
248 # save old value
249 strAsFacL <- options()$stringsAsFactors
250 options(stringsAsFactors = FALSE)
251
252
253 ## log file
254 ##---------
255
256 log_print("Start of the '", modNamC, "' Galaxy module call")
257
258 ## arguments
259 ##----------
260
261 args_env <- new.env()
262
263 # files
264
265 log_print("PARAMETERS (raw):")
266 invisible(
267 lapply(
268 X = 1:length(argVc)
269 , FUN = function(i) {
270 log_print(sprintf(" - %s: %s", names(argVc)[i], argVc[i]))
271 }
272 )
273 )
274
275 # write.table(as.matrix(argVc), col.names=F, quote=F, sep='\t')
276
277 ## output files
278 sampleMetadata_out <- as.character(argVc["sampleMetadata_out"])
279 variableMetadata_out <- as.character(argVc["variableMetadata_out"])
280 scores_out <- as.character(argVc["scores_out"])
281 ## input files
282 args_env$data_matrix_path <- as.character(argVc["data_matrix_path"])
283 args_env$variable_metadata_path <- as.character(argVc["variable_metadata_path"])
284 args_env$sample_metadata_path <- as.character(argVc["sample_metadata_path"])
285
286 # other parameters
287
288 # multi-string args - split csv: "1,2,3" -> c("1","2","3")
289 args_env$kfeatures <- strsplit(x = as.character(argVc['kfeatures']), split = ",", fixed = TRUE)[[1]]
290 args_env$ksamples <- strsplit(x = as.character(argVc['ksamples' ]), split = ",", fixed = TRUE)[[1]]
291 # numeric args
292 args_env$iter_max <- as.numeric( argVc['iter_max' ])
293 args_env$nstart <- as.numeric( argVc['nstart' ])
294 args_env$slots <- as.numeric( argVc['slots' ])
295 # string args
296 args_env$algorithm <- as.character( argVc['algorithm'])
297 args_env$log_print <- log_print
298
299 log_print("PARAMETERS (parsed):")
300 for (member in ls(args_env)) {
301 value <- get(member, args_env)
302 value <- ifelse(length(value) == 1, value, sprintf("c(%s)", paste(value, collapse=", ")))
303
304 log_print(sprintf(" - %s: %s", member, ifelse( !is.function(value) , value, "function" )))
305 }
306 log_print("")
307
308 ##---------------------------------------------------------
309 ## Computation - attempt to read input data
310 ##---------------------------------------------------------
311 if ( ! read_input_data(args_env, failure_action = read_input_failure_action) ) {
312 result <- -1
313 } else {
314 log_print("Input data was read successfully.")
315 result <- w4mkmeans(env = args_env)
316 log_print("returned from call to w4mkmeans.")
317 }
318
319 if ( length(result) == 0 ) {
320 log_print("no results were produced")
321 # exit with status code non-zero to indicate error
322 q(save = "no", status = 1, runLast = FALSE)
323 } else if ( ! setequal(names(result),c("variableMetadata","sampleMetadata","scores")) ) {
324 log_print(sprintf("unexpected result keys %s", names(result)))
325 # exit with status code non-zero to indicate error
326 q(save = "no", status = 1, runLast = FALSE)
327 } else if ( ! write_result(result = result$variableMetadata, file_path = variableMetadata_out, kind_string = "clustered variableMetadata")$success ) {
328 log_print("failed to write output file for clustered variableMetadata")
329 # exit with status code non-zero to indicate error
330 q(save = "no", status = 1, runLast = FALSE)
331 } else if ( ! write_result(result = result$sampleMetadata, file_path = sampleMetadata_out, kind_string = "clustered sampleMetadata")$success ) {
332 log_print("failed to write output file for clustered sampleMetadata")
333 # exit with status code non-zero to indicate error
334 q(save = "no", status = 1, runLast = FALSE)
335 } else {
336 tryCatch(
337 expr = {
338 fileConn<-file(scores_out)
339 writeLines(result$scores, fileConn)
340 close(fileConn)
341 }
342 , error = function(e) {
343 log_print(sprintf("failed to write output file for cluster scores - %s", format_error(e)))
344 # exit with status code non-zero to indicate error
345 q(save = "no", status = 1, runLast = FALSE)
346 }
347 )
348 }
349
350 ##--------
351 ## Closing
352 ##--------
353
354
355 if (!file.exists(sampleMetadata_out)) {
356 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, sampleMetadata_out))
357 }
358
359 if (!file.exists(variableMetadata_out)) {
360 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, variableMetadata_out))
361 }
362
363 if (!file.exists(scores_out)) {
364 log_print(sprintf("ERROR %s::w4m_kmeans_wrapper - file '%s' was not created", modNamC, scores_out))
365 }
366
367 log_print("Normal termination of '", modNamC, "' Galaxy module call")
368
369 # exit with status code zero
370 q(save = "no", status = 0, runLast = FALSE)