comparison batch_correction_all_loess_script.R @ 0:b74d1d533dea draft default tip

planemo upload for repository https://github.com/workflow4metabolomics/batchcorrection.git commit 241fb99a843e13195c5054cd9731e1561f039bde
author ethevenot
date Thu, 04 Aug 2016 11:40:35 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b74d1d533dea
1 loessF <- function(datVn, qcaVi, preVi, spnN, vrbL=FALSE) {
2
3 if(length(qcaVi) < 5) {
4
5 if(vrbL)
6 cat("\nWarning: less than 5 '", refC, "'; linear regression will be performed instead of loess regression for this batch\n", sep="")
7
8 return(predict(lm(datVn[qcaVi] ~ qcaVi),
9 newdata = data.frame(qcaVi = preVi)))
10
11 } else {
12
13 return(predict(loess(datVn[qcaVi] ~ qcaVi,
14 control = loess.control(surface = "direct"),
15 span = spnN),
16 newdata = data.frame(qcaVi = preVi)))
17
18 }
19
20 ## Note:
21 ## the surface = 'direct' argument allows extrapolation
22
23 } ## loessF
24
25 plotBatchF <- function(datMN, samDF.arg, spnN.arg) {
26
27 maiC <- switch(gsub("MN", "", deparse(substitute(datMN))),
28 raw = "Raw",
29 nrm = "Normalized")
30
31 colVc <- c(sample = "green4",
32 pool = "red",
33 blank = "black",
34 other = "yellow")
35
36 par(font = 2, font.axis = 2, font.lab = 2, lwd = 2, pch = 18)
37
38 layout(matrix(c(1, 1, 2, 3), nrow = 2),
39 widths = c(0.7, 0.3))
40
41 obsNamVc <- rownames(datMN)
42
43 obsColVc <- sapply(samDF.arg[, "sampleType"],
44 function(typC)
45 ifelse(typC %in% names(colVc), colVc[typC], colVc["other"]))
46
47 ## Graphic 1: Sum of intensities for each sample
48
49 par(mar = c(3.6, 3.6, 3.1, 0.6))
50
51 batTab <- table(samDF.arg[, "batch"])
52
53 sumVn <- rowSums(datMN, na.rm = TRUE)
54
55 plot(sumVn,
56 cex = 1.2,
57 col = obsColVc,
58 pch = 18,
59 xaxs = "i",
60 xlab = "",
61 ylab = "")
62
63 mtext("Injection order",
64 line = 2.2,
65 side = 1)
66 mtext("Sum of variable intensities",
67 line = 2.2,
68 side = 2)
69
70 mtext(maiC, cex = 1.2, line = 1.5, side = 3)
71
72 abline(v = cumsum(batTab) + 0.5,
73 col = "red")
74
75 mtext(names(batTab),
76 at = batTab / 2 + c(0, cumsum(batTab[-length(batTab)])))
77
78 obsColVuc <- obsColVc[sort(unique(names(obsColVc)))]
79
80 text(rep(batTab[1], times = length(obsColVuc)),
81 par("usr")[3] + (0.97 - length(obsColVuc) * 0.03 + 1:length(obsColVuc) * 0.03) * diff(par("usr")[3:4]),
82 col = obsColVuc,
83 font = 2,
84 labels = names(obsColVuc),
85 pos = 2)
86
87 for(batC in names(batTab)) {
88
89 batSeqVi <- which(samDF.arg[, "batch"] == batC)
90 batPooVi <- intersect(batSeqVi,
91 grep("pool", samDF.arg[, "sampleType"]))
92 batSamVi <- intersect(batSeqVi,
93 grep("sample", samDF.arg[, "sampleType"]))
94 if(length(batPooVi))
95 lines(batSeqVi,
96 loessF(sumVn, batPooVi, batSeqVi, spnN=spnN.arg),
97 col = colVc["pool"])
98 lines(batSeqVi,
99 loessF(sumVn, batSamVi, batSeqVi, spnN=spnN.arg),
100 col = colVc["sample"])
101
102 }
103
104 ## Graphics 2 and 3 (right): PCA score plots of components 1-4
105
106 radVn <- seq(0, 2 * pi, length.out = 100)
107 epsN <- .Machine[["double.eps"]] ## [1] 2.22e-16
108
109 pcaMN <- datMN
110
111 pcaLs <- opls(pcaMN, predI = 4, printL = FALSE, plotL = FALSE)
112 tMN <- getScoreMN(pcaLs)
113 vRelVn <- pcaLs@modelDF[, "R2X"]
114
115 n <- nrow(tMN)
116 hotN <- 2 * (n - 1) * (n^2 - 1) / (n^2 * (n - 2))
117
118 hotFisN <- hotN * qf(0.95, 2, n - 2)
119
120 pcsLs <- list(c(1, 2), c(3, 4))
121
122 par(mar = c(3.6, 3.6, 0.6, 1.1))
123
124 for(pcsN in 1:length(pcsLs)) {
125
126 pcsVn <- pcsLs[[pcsN]]
127
128 tcsMN <- tMN[, pcsVn]
129
130 micMN <- solve(cov(tcsMN))
131
132 n <- nrow(tMN)
133 hotN <- 2 * (n - 1) * (n^2 - 1) / (n^2 * (n - 2))
134
135 hotFisN <- hotN * qf(0.95, 2, n - 2)
136
137 hotVn <- apply(tcsMN,
138 1,
139 function(x) 1 - pf(1 / hotN * t(as.matrix(x)) %*% micMN %*% as.matrix(x), 2, n - 2))
140
141 obsHotVi <- which(hotVn < 0.05)
142
143 xLabC <- paste("t",
144 pcsVn[1],
145 "(",
146 round(vRelVn[pcsVn[1]] * 100),
147 "%)",
148 sep = "")
149
150 yLabC <- paste("t",
151 pcsVn[2],
152 "(",
153 round(vRelVn[pcsVn[2]] * 100),
154 "%)",
155 sep = "")
156
157 xLimVn <- c(-1, 1) * max(sqrt(var(tcsMN[, 1]) * hotFisN), max(abs(tcsMN[, 1])))
158 yLimVn <- c(-1, 1) * max(sqrt(var(tcsMN[, 2]) * hotFisN), max(abs(tcsMN[, 2])))
159
160 plot(tcsMN,
161 main = "",
162 type = "n",
163 xlab = "",
164 ylab = "",
165 xlim = xLimVn,
166 ylim = yLimVn)
167
168 mtext(xLabC,
169 line = 2.2,
170 side = 1)
171 mtext(yLabC,
172 line = 2.2,
173 side = 2)
174
175 par(lwd = 1)
176
177 abline(v = axTicks(1),
178 col = "grey")
179
180 abline(h = axTicks(2),
181 col = "grey")
182
183 abline(v = 0)
184 abline(h = 0)
185
186 lines(sqrt(var(tcsMN[, 1]) * hotFisN) * cos(radVn),
187 sqrt(var(tcsMN[, 2]) * hotFisN) * sin(radVn))
188
189 points(tcsMN,
190 col = obsColVc,
191 pch = 18)
192
193 if(length(obsHotVi))
194 text(tcsMN[obsHotVi, 1],
195 tcsMN[obsHotVi, 2],
196 col = obsColVc[obsHotVi],
197 labels = obsNamVc[obsHotVi],
198 pos = 3)
199
200 } ## for(pcsN in 1:length(pcsLs)) {
201
202 return(invisible(list(sumVn = sumVn,
203 tcsMN = tcsMN)))
204
205 } ## plotBatchF
206
207 shiftBatchCorrectF <- function(rawMN.arg,
208 samDF.arg,
209 refC.arg,
210 spnN.arg) {
211
212 cat("\nReference observations are: ", refC.arg, "\n")
213
214 ## computing median off all pools (or samples) for each variable
215
216 refMeaVn <- apply(rawMN.arg[samDF.arg[, "sampleType"] == refC.arg, ],
217 2,
218 function(feaRefVn) mean(feaRefVn, na.rm = TRUE))
219
220 ## splitting data and sample metadata from each batch
221
222 batRawLs <- split(as.data.frame(rawMN.arg),
223 f = samDF.arg[, "batch"])
224 batRawLs <- lapply(batRawLs, function(inpDF) as.matrix(inpDF))
225
226 batSamLs <- split(as.data.frame(samDF.arg),
227 f = samDF.arg[, "batch"])
228
229 ## checking extrapolation: are there pools at the first and last observations of each batch
230
231 if(refC.arg == "pool") {
232 pooExtML <- matrix(FALSE, nrow = 2, ncol = length(batRawLs),
233 dimnames = list(c("first", "last"), names(batRawLs)))
234 for(batC in names(batSamLs)) {
235 batSamTypVc <- batSamLs[[batC]][, "sampleType"]
236 pooExtML["first", batC] <- head(batSamTypVc, 1) == "pool"
237 pooExtML["last", batC] <- tail(batSamTypVc, 1) == "pool"
238 }
239 if(!all(c(pooExtML))) {
240 cat("\nWarning: Pools are missing at the first and/or last position of the following batches:\n")
241 pooExtBatVi <- which(!apply(pooExtML, 2, all))
242 for(i in 1:length(pooExtBatVi))
243 cat(names(pooExtBatVi)[i], ": ",
244 paste(rownames(pooExtML)[!pooExtML[, pooExtBatVi[i]]], collapse = ", "), "\n", sep = "")
245 cat("Extrapolating loess fits for these batches may result in inaccurate modeling!\n")
246 }
247 }
248
249 ## normalizing
250
251 nrmMN <- NULL ## normalized data matrix to be computed
252
253 cat("\nProcessing batch:")
254
255 for(batC in names(batRawLs)) { ## processing each batch individually
256
257 cat("\n", batC)
258
259 batRawMN <- batRawLs[[batC]]
260 batSamDF <- batSamLs[[batC]]
261
262 batAllVi <- 1:nrow(batRawMN)
263
264 batRefVi <- grep(refC.arg, batSamDF[, "sampleType"])
265
266 ## prediction of the loess fit
267
268 batLoeMN <- apply(batRawMN,
269 2,
270 function(rawVn) loessF(rawVn, batRefVi, batAllVi, spnN=spnN.arg, vrbL=TRUE))
271
272 ## normalization
273
274 batLoeMN[batLoeMN <= 0] <- NA
275
276 batNrmMN <- batRawMN / batLoeMN
277
278 nrmMN <- rbind(nrmMN,
279 batNrmMN)
280
281 }
282
283 cat("\n")
284
285 nrmMN <- sweep(nrmMN, MARGIN = 2, STATS = refMeaVn, FUN = "*")
286
287 return(nrmMN)
288
289 } ## shiftBatchCorrectF