comparison w4mkmeans_routines.R @ 0:6ccbe18131a6 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author eschen42
date Tue, 08 Aug 2017 15:30:38 -0400
parents
children 02cafb660b72
comparison
equal deleted inserted replaced
-1:000000000000 0:6ccbe18131a6
1 ##------------------------------------------------------------------------------------------------------
2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool
3 ##------------------------------------------------------------------------------------------------------
4
5 library(parallel)
6
7 w4kmeans_usage <- function() {
8 return (
9 c(
10 "w4mkmeans: bad input.",
11 "# contract:",
12 " required - caller will provide an environment comprising:",
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...",
14 " variableMetadata - the corresponding W4M data.frame having feature metadata",
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata",
16 " dataMatrix - the corresponding W4M matrix",
17 " slots - the number of parallel slots for calculating kmeans",
18 " optional - environment may comprise:",
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)",
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)",
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)",
22 " nstart - how many random sets of centers should be chosen (default = 1)",
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
24 " ",
25 " this routine will return a list comprising:",
26 " variableMetadata - the input variableMetadata data.frame with updates, if any",
27 " sampleMetadata - the input sampleMetadata data.frame with updates, if any",
28 " scores - an array of strings, each representing a line of a tsv having the following header:",
29 " clusterOn TAB k TAB totalSS TAB betweenSS TAB proportion"
30 )
31 )
32 }
33
34 w4mkmeans <- function(env) {
35 # abort if 'env' is null or is not an environment
36 if ( is.null(env) || ! is.environment(env) ) {
37 lapply(w4kmeans_usage(),print)
38 }
39 # supply default arguments
40 if ( ! exists("iter.max" , env) ) env$iter.max <- 10
41 if ( ! exists("nstart" , env) ) env$nstart <- 1
42 if ( ! exists("algorithm", env) ) env$algorithm <- 'Hartigan-Wong'
43 if ( ! exists("ksamples" , env) ) env$ksamples <- c()
44 if ( ! exists("kfeatures", env) ) env$kfeatures <- c()
45 # check mandatory arguments
46 expected <- c(
47 "log_print"
48 , "variableMetadata"
49 , "sampleMetadata"
50 , "dataMatrix"
51 , "slots"
52 )
53 missing_from_env <- setdiff(expected, (ls(env)))
54 if ( length(missing_from_env) > 0 ) {
55 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", "))
56 lapply(w4kmeans_usage(),print)
57 stop("w4mkmeans: contract has been broken")
58 }
59 # extract parameters from 'env'
60 failure_action <- env$log_print
61 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
62 sampleMetadata <- env$sampleMetadata
63 featureMetadata <- env$variableMetadata
64 ksamples <- as.numeric(env$ksamples)
65 kfeatures <- as.numeric(env$kfeatures)
66 slots <- env$slots
67
68 myLapply <- parLapply
69 # uncomment the next line to mimic parLapply, but without parallelization (for testing/experimentation)
70 # myLapply <- function(cl, ...) lapply(...)
71 cl <- NULL
72 if ( identical(myLapply, parLapply) ) {
73 failure_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots))
74 failure_action(names(cl))
75 cl <- makePSOCKcluster(names = slots)
76 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
77 clusterExport(
78 cl = cl
79 , varlist = c(
80 "tryCatchFunc"
81 , "calc_kmeans_one_dimension_one_k"
82 , "prepare.data.matrix"
83 )
84 )
85 final <- function(cl) {
86 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
87 if ( !is.null(cl) ) {
88 failure_action("w4mkmeans: stopping cluster used for parallel evaluation")
89 stopCluster(cl)
90 }
91 }
92 } else {
93 failure_action("w4mkmeans: using sequential evaluation (1 slot)")
94 final <- function(cl) { }
95 }
96
97 tryCatch(
98 expr = {
99 # These myLapply calls produce lists of lists of results:
100 # - The outer list has no keys and its members are accessed by index
101 # - The inner list has keys "clusters" and "scores"
102
103 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata
104 ksamples_length <- length(ksamples)
105 if ( ksamples_length > 0 ) {
106 smpl_result_list <- myLapply(
107 cl = cl
108 , ksamples
109 , calc_kmeans_one_dimension_one_k
110 , env = env
111 , dimension = "samples"
112 )
113 for ( i in 1:ksamples_length ) {
114 result <- smpl_result_list[[i]]
115 if (result$success) {
116 sampleMetadata[sprintf("k%d",ksamples[i])] <- result$value$clusters
117 scores <- c(scores, result$value$scores)
118 }
119 }
120 }
121
122 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata
123 kfeatures_length <- length(kfeatures)
124 if ( kfeatures_length > 0 ) {
125 feat_result_list <- myLapply(
126 cl = cl
127 , kfeatures
128 , calc_kmeans_one_dimension_one_k
129 , env = env
130 , dimension = "features"
131 )
132 for ( i in 1:kfeatures_length ) {
133 result <- feat_result_list[[i]]
134 if (result$success) {
135 featureMetadata[sprintf("k%d",kfeatures[i])] <- result$value$clusters
136 scores <- c(scores, result$value$scores)
137 }
138 }
139 }
140
141 return (
142 list(
143 variableMetadata = featureMetadata
144 , sampleMetadata = sampleMetadata
145 , scores = scores
146 )
147 )
148 }
149 , finally = final(cl)
150 )
151 }
152
153 # calculate k-means for features or samples
154 # - recall that the dataMatrix has features in rows and samples in columns
155 # return value:
156 # list(clusters = km$cluster, scores = scores)
157 # arguments:
158 # env:
159 # environment having dataMatrix
160 # dimension:
161 # - "samples": produce clusters column to add to the sampleMetadata table
162 # - this is the default case
163 # - "variables": produce clusters column to add to the variableMetadata table
164 # k:
165 # integer, the number of clusters to make
166 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") {
167 # abort if environment is not as expected
168 if ( is.null(env) || ! is.environment(env) ) {
169 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment")
170 }
171 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) {
172 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function")
173 }
174 # abort if k is not as expected
175 if ( ! is.numeric(k) ) {
176 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k)))
177 }
178 k <- as.integer(k)
179 # abort if dimension is not as expected
180 if ( ! is.character(dimension)
181 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) {
182 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'")
183 }
184 dm <- env$dataMatrix
185 iter.max <- env$iter.max
186 nstart <- env$nstart
187 algorithm <- env$algorithm
188 dim_features <- dimension == "features"
189 # tryCatchFunc produces a list
190 # On success of expr(), tryCatchFunc produces
191 # list(success TRUE, value = expr(), msg = "")
192 # On failure of expr(), tryCatchFunc produces
193 # list(success = FALSE, value = NA, msg = "the error message")
194 result_list <- tryCatchFunc( expr = function() {
195 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows
196 # - to calculate sample-clusters, no transposition is needed because samples are rows
197 # - to calculate feature-clusters, transposition is needed so that features will be the rows
198 if ( ! dim_features ) dm <- t(dm)
199 dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } )
200 # need to set.seed to get reproducible results from kmeans
201 set.seed(4567)
202 # do the k-means clustering
203 km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm )
204 scores <-
205 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f"
206 , dimension
207 , k
208 , km$totss
209 , km$betweenss
210 , km$betweenss/km$totss
211 )
212 list(clusters = km$cluster, scores = scores)
213 })
214 return ( result_list )
215 }
216