comparison w4m_general_purpose_routines.R @ 0:6ccbe18131a6 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author eschen42
date Tue, 08 Aug 2017 15:30:38 -0400
parents
children c415b7dc6f37
comparison
equal deleted inserted replaced
-1:000000000000 0:6ccbe18131a6
1 # prepare.data.matrix - Prepare x.datamatrix for multivariate statistical analaysis (MVA)
2 # - Motivation:
3 # - Selection:
4 # - You may want to exclude several samples from your analysis:
5 # - If so, set the argument 'exclude.samples' to a vector of sample names
6 # - You may want to exclude several features or features from your analysis:
7 # - If so, set the argument 'exclude.features' to a vector of feature names
8 # - Renaming samples:
9 # - You may want to rename several samples from your analysis:
10 # - If so, set the argument 'sample.rename.function' to a function accepting a vector
11 # of sample names and producing a vector of strings of equivalent length
12 # - MVA is confounded by missing values.
13 # - By default, this function imputes missing values as zero.
14 # - For a different imputation, set the 'data.imputation' argument to a function
15 # accepting a single matrix argument and returning a matrix of the same
16 # dimensions as the argument.
17 # - Transformation
18 # - It may be desirable to transform the intensity data to reduce the range.
19 # - By default, this function performs an eigth-root transformation:
20 # - Any root-tranformation has the advantage of never being negative.
21 # - Calculation of the eight-root is four times faster in my hands than log10.
22 # - However, it has the disadvantage that calculation of fold-differences
23 # is not additive as with log-transformation.
24 # - Rather, you must divide the values and raise to the eighth power.
25 # - For a different transformation, set the 'data.transformation' argument
26 # to a function accepting a single matrix argument.
27 # - The function should be written to return a matrix of the same dimensions
28 # as the argument.
29 # arguments:
30 # - x.matrix - matrix of intensities (or data.frame of sample metadata)
31 # - one row per sample
32 # - one column per feature or metadata attribute
33 # - exclude.samples - vector of labels of matrix rows (samples) to omit from analysis
34 # - exclude.features - vector of labels of matrix columnss (features) to omit from analysis
35 # - sample.rename.function - function to be used to rename rows if necessary, or NULL
36 # - e.g., sample.rename.function = function(x) {
37 # sub("(.*)_.*","\\1", row.names(x))
38 # }
39 # - data.imputation - function applied to matrix to impute missing values
40 # - e.g., data.imputation = function(m) {
41 # m[is.na(m)] <- min(m, na.rm = TRUE) / 100
42 # return (m)
43 # }
44 # - data.transformation - function applied to matrix cells
45 # - e.g., data.transformation = function(x) { return( log10(x) ) }
46 # or, data.transformation = log10
47 # result value:
48 # transformed, imputed x.datamatrix with renamed rows and with neither excluded values nor features
49 #
50 ################################
51 ##
52 ## Notes regarding the effectiveness and performance of the data transformation method.
53 ##
54 ## The two transformations that I tried (log10 and 8th root) required different imputation methods.
55 ##
56 ## For the LCMS resin data set that I was working with, separation in MVA was nearly equivalent for:
57 ## data.imputation <- function(x.matrix) {
58 ## x.matrix[is.na(x.matrix)] <- 0
59 ## return (x.matrix)
60 ## }
61 ## data.transformation <- function(x) {
62 ## sqrt( sqrt( sqrt(x) ) )
63 ## }
64 ## and
65 ## data.imputation <- function(x.matrix) {
66 ## x.matrix[is.na(x.matrix)] <- min(x.matrix, na.rm = TRUE) / 100
67 ## return (x.matrix)
68 ## }
69 ## data.transformation <- function(x) {
70 ## log10(x)
71 ## }
72 ##
73 ## Note further that triple application of the square root:
74 ## - may be four times faster than log10:
75 ## - may be three times faster than log2:
76 ##
77 ## system.time( junk <- sqrt( sqrt( sqrt(1:100000000) ) ) )
78 ## user system elapsed
79 ## 0.832 0.236 1.069
80 ## system.time( junk <- log10(1:100000000) )
81 ## user system elapsed
82 ## 3.936 0.400 4.337
83 ## system.time( junk <- log2(1:100000000) )
84 ## user system elapsed
85 ## 2.784 0.320 3.101
86 ##
87 ################################
88 #
89 prepare.data.matrix <- function(
90 x.matrix
91 , exclude.samples = NULL
92 , exclude.features = NULL
93 , sample.rename.function = NULL
94 , data.imputation =
95 function(m) {
96 # replace NA values with zero
97 m[is.na(m)] <- 0
98 # replace negative values with zero, if applicable (It should never be applicable!)
99 if (min(m < 0)) {
100 m <- matrix(lapply(X = m, FUN = function(z) {max(z,0)}), nrow = nrow(m) )
101 }
102 # return matrix as the result
103 return (m)
104 }
105 , data.transformation = function(x) {
106 sqrt( sqrt( sqrt(x) ) )
107 }
108 , en = new.env()
109 ) {
110 # MatVar - Compute variance of rows or columns of a matrix
111 # ref: http://stackoverflow.com/a/25100036
112 # For row variance, dim == 1, for col variance, dim == 2
113 MatVar <- function(x, dim = 1) {
114 if (dim == 1) {
115 dim.x.2 <- dim(x)[2]
116 if ( dim.x.2 == 0 )
117 stop("MatVar: there are zero columns")
118 if ( dim.x.2 == 1 ) {
119 stop("MatVar: a single column is insufficient to calculate a variance")
120 # return ( rep.int(x = 0, times = nrow(x)) )
121 } else {
122 return ( rowSums( (x - rowMeans(x))^2 ) / ( dim(x)[2] - 1 ) )
123 }
124 } else if (dim == 2) {
125 dim.x.1 <- dim(x)[1]
126 if ( dim.x.1 == 0 ) {
127 stop("MatVar: there are zero rows")
128 }
129 if ( dim.x.1 == 1 ) {
130 stop("MatVar: a single row is insufficient to calculate a variance")
131 # return ( rep.int(x = 0, times = ncol(x)) )
132 } else {
133 return ( rowSums( (t(x) - colMeans(x))^2 ) / ( dim(x)[1] - 1 ) )
134 }
135 } else stop("Please enter valid dimension, for rows, dim = 1; for colums, dim = 2")
136 }
137
138 nonzero.var <- function(x) {
139 if (nrow(x) == 0) {
140 print(str(x))
141 stop("matrix has no rows")
142 }
143 if (ncol(x) == 0) {
144 print(str(x))
145 stop("matrix has no columns")
146 }
147 if ( is.numeric(x) ) {
148 # exclude any rows with zero variance
149 row.vars <- MatVar(x, dim = 1)
150 nonzero.row.vars <- row.vars > 0
151 nonzero.rows <- row.vars[nonzero.row.vars]
152 if ( length(rownames(x)) != length(rownames(nonzero.rows)) ) {
153 row.names <- attr(nonzero.rows,"names")
154 x <- x[ row.names, , drop = FALSE ]
155 }
156
157 # exclude any columns with zero variance
158 column.vars <- MatVar(x, dim = 2)
159 nonzero.column.vars <- column.vars > 0
160 nonzero.columns <- column.vars[nonzero.column.vars]
161 if ( length(colnames(x)) != length(colnames(nonzero.columns)) ) {
162 column.names <- attr(nonzero.columns,"names")
163 x <- x[ , column.names, drop = FALSE ]
164 }
165 }
166 return (x)
167 }
168
169 if (is.null(x.matrix)) {
170 stop("FATAL ERROR - prepare.data.matrix was called with null x.matrix")
171 }
172
173 en$xpre <- x <- x.matrix
174
175 # exclude any samples as indicated
176 if ( !is.null(exclude.features) ) {
177 my.colnames <- colnames(x)
178 my.col.diff <- setdiff(my.colnames, exclude.features)
179 x <- x[ , my.col.diff , drop = FALSE ]
180 }
181
182 # exclude any features as indicated
183 if ( !is.null(exclude.samples) ) {
184 my.rownames <- rownames(x)
185 my.row.diff <- setdiff(my.rownames, exclude.samples)
186 x <- x[ my.row.diff, , drop = FALSE ]
187 }
188
189 # rename rows if desired
190 if ( !is.null(sample.rename.function) ) {
191 renamed <- sample.rename.function(x)
192 rownames(x) <- renamed
193 }
194
195 # save redacted x.datamatrix to environment
196 en$redacted.data.matrix <- x
197
198 # impute values missing from the x.datamatrix
199 if ( !is.null(data.imputation) ) {
200 x <- data.imputation(x)
201 }
202
203 # perform transformation if desired
204 if ( !is.null(data.transformation) ) {
205 x <- data.transformation(x)
206 } else {
207 x <- x
208 }
209
210 # purge rows and columns that have zero variance
211 if ( is.numeric(x) ) {
212 x <- nonzero.var(x)
213 }
214
215 # save imputed, transformed x.datamatrix to environment
216 en$imputed.transformed.data.matrix <- x
217
218 return(x)
219 }
220
221
222 ##-----------------------------------------------
223 ## helper functions for error detection/reporting
224 ##-----------------------------------------------
225
226 # log-printing to stderr
227 log_print <- function(x, ...) {
228 cat(
229 format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")
230 , " "
231 , c(x, ...)
232 , "\n"
233 , sep=""
234 , file=stderr()
235 )
236 }
237
238 # tryCatchFunc produces a list
239 # On success of expr(), tryCatchFunc produces
240 # list(success TRUE, value = expr(), msg = "")
241 # On failure of expr(), tryCatchFunc produces
242 # list(success = FALSE, value = NA, msg = "the error message")
243 tryCatchFunc <- function(expr) {
244 # format error for logging
245 format_error <- function(e) {
246 paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ")
247 }
248 my_expr <- expr
249 retval <- NULL
250 tryCatch(
251 expr = {
252 retval <- ( list( success = TRUE, value = my_expr(), msg = "" ) )
253 }
254 , error = function(e) {
255 retval <<- list( success = FALSE, value = NA, msg = format_error(e) )
256 }
257 )
258 return (retval)
259 }
260
261 # tryCatchProc produces a list
262 # On success of expr(), tryCatchProc produces
263 # list(success TRUE, msg = "")
264 # On failure of expr(), tryCatchProc produces
265 # list(success = FALSE, msg = "the error message")
266 tryCatchProc <- function(expr) {
267 # format error for logging
268 format_error <- function(e) {
269 paste(c("Error { message:", e$message, ", call:", e$call, "}"), collapse = " ")
270 }
271 retval <- NULL
272 tryCatch(
273 expr = {
274 expr()
275 retval <- ( list( success = TRUE, msg = "" ) )
276 }
277 , error = function(e) {
278 retval <<- list( success = FALSE, msg = format_error(e) )
279 }
280 )
281 return (retval)
282 }
283