1
|
1 #########################################################################
|
|
2 # SCRIPT INTENSITY CHECK #
|
|
3 # #
|
|
4 # Input: Data Matrix, VariableMetadata, SampleMetadata #
|
3
|
5 # Output: VariableMetadata, Graphics #
|
1
|
6 # #
|
|
7 # Dependencies: RcheckLibrary.R #
|
|
8 # #
|
|
9 #########################################################################
|
0
|
10
|
|
11
|
|
12 # Parameters (for dev)
|
|
13 if(FALSE){
|
|
14
|
|
15 rm(list = ls())
|
3
|
16 setwd("Y:\\Developpement\\Intensity check\\Pour tests\\Tests_global")
|
0
|
17
|
|
18 DM.name <- "DM_NA.tabular"
|
|
19 SM.name <- "SM_NA.tabular"
|
|
20 VM.name <- "vM_NA.tabular"
|
3
|
21 method <- "one_class"
|
|
22 chosen.stat <- "mean,sd,quartile,decile,NA"
|
|
23 class.col <- "2"
|
|
24 test.fold <- "Yes"
|
|
25 class1 <- "Pools"
|
1
|
26 fold.frac <- "Top"
|
3
|
27 logarithm <- "log10"
|
0
|
28 VM.output <- "new_VM.txt"
|
|
29 graphs.output <- "Barplots_and_Boxplots.pdf"
|
|
30 }
|
|
31
|
1
|
32
|
|
33
|
|
34
|
3
|
35 intens_check <- function(DM.name, SM.name, VM.name, method, chosen.stat, class.col, test.fold, class1, fold.frac,
|
|
36 logarithm, VM.output, graphs.output){
|
1
|
37
|
3
|
38 # This function allows to check the intensities with various statistics, number of missing values and mean fold change
|
0
|
39 #
|
3
|
40 # Three methods proposed:
|
|
41 # - global: tests for each variable without distinction between samples
|
|
42 # - one class: one class versus all the remaining samples
|
|
43 # - each class: if the class columns contains at least three classes and you want to test each of them
|
0
|
44 #
|
|
45 # Parameters:
|
|
46 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
|
3
|
47 # method: "global", "one_class", "each_class"
|
|
48 # chosen.stat: character listing the chosen analysis (comma-separated)
|
|
49 # class.col: number of the sampleMetadata's column with classes (if method = one_class or each_class)
|
|
50 # test.fold: "yes" or "no" (if method = one_class or each_class)
|
|
51 # class1: name of the class (if method = one_class)
|
|
52 # fold.frac: "Top" -> class1/other or "Bottom" -> other/class1 (if method = one_class)
|
|
53 # logarithm: "log2", "log10" or "none" (if method = one_class or each_class)
|
1
|
54 # VM.output: output file's access (VM with new columns)
|
|
55 # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values
|
|
56
|
0
|
57
|
|
58
|
|
59
|
1
|
60 # Input ---------------------------------------------------------------------------------------------------
|
0
|
61
|
|
62 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
63 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
64 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
65
|
|
66
|
|
67
|
|
68 # Table match check with Rchecklibrary
|
|
69 table.check <- match3(DM, SM, VM)
|
|
70 check.err(table.check)
|
3
|
71
|
5
|
72 # Transposing the dataMatrix
|
0
|
73 rownames(DM) <- DM[,1]
|
|
74 var_names <- DM[,1]
|
|
75 DM <- DM[,-1]
|
|
76 DM <- data.frame(t(DM))
|
5
|
77
|
|
78 # Re-ordering the dataMatrix to match the sampleMetadata file order
|
|
79 DM <- merge(x=cbind(1:nrow(SM),SM), y=DM, by.x=2, by.y=0)
|
|
80 DM <- DM[order(DM[,2]),]
|
|
81 rownames(DM) <- DM[,1]
|
|
82 DM <- DM[,-c(1:(ncol(SM)+1))]
|
|
83
|
|
84
|
3
|
85 stat.list <- strsplit(chosen.stat,",")[[1]]
|
0
|
86
|
1
|
87
|
|
88 # check class.col, class1 and the number of classes ---------------------------------------------------------
|
0
|
89
|
3
|
90 #set 1 class for all samples in case of method = no_class
|
|
91 if(method=="no_class"){
|
|
92 c_class <- rep("global", length=nrow(DM))
|
|
93 classnames <- "global"
|
|
94 nb_class=1
|
|
95 test.fold <- "No"
|
0
|
96 }
|
3
|
97
|
|
98
|
|
99 if(method != "no_class"){
|
|
100
|
|
101 class.col <- colnames(SM)[as.numeric(class.col)]
|
|
102
|
|
103 if(!(class.col %in% colnames(SM))){
|
|
104 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
|
|
105 }
|
0
|
106
|
3
|
107 c_class <- SM[,class.col]
|
|
108 c_class <- as.factor(c_class)
|
|
109 nb_class <- nlevels(c_class)
|
|
110 classnames <- levels(c_class)
|
|
111
|
|
112 if((nb_class < 2)&&(test.fold=="Yes")){
|
5
|
113 err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed. \n")
|
3
|
114 cat(err.1class)
|
|
115 }
|
|
116
|
|
117 if((nb_class > (nrow(SM))/3)&&(method == "each_class")){
|
|
118 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
|
5
|
119 with few samples. \n")
|
3
|
120 cat(class.err)
|
|
121 }
|
|
122
|
|
123
|
|
124 if(method == "one_class"){
|
|
125 if(!(class1 %in% classnames)){
|
|
126 list.class1 <- c("\n Classes:",classnames,"\n")
|
|
127 cat(list.class1)
|
|
128 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
|
|
129 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
|
|
130 }
|
0
|
131
|
3
|
132 #If method is "one_class", change others classes in "other"
|
|
133 for(i in 1:length(c_class)){
|
|
134 if(c_class[i]!=class1){
|
|
135 c_class <- as.character(c_class)
|
|
136 c_class[i] <- "Other"
|
|
137 c_class <- as.factor(c_class)
|
|
138 nb_class <- nlevels(c_class)
|
|
139 classnames <- c(class1,"Other")
|
|
140 }
|
|
141 }
|
|
142 }
|
|
143
|
1
|
144 }
|
3
|
145
|
|
146
|
|
147 # Statistics ------------------------------------------------------------------------------------------------
|
|
148
|
5
|
149 # Check whether the dataMatrix contains non-numeric values
|
|
150 if(!(is.numeric(as.matrix(DM)))){
|
|
151 # findchar definition
|
|
152 findchar <- function(myval){
|
|
153 if(is.na(myval)){
|
|
154 return("ok")
|
|
155 }else{
|
|
156 mytest <- as.character(myval)
|
|
157 if(is.na(as.numeric(mytest[1]))){
|
|
158 return("char")
|
|
159 }else{
|
|
160 return("ok")
|
|
161 }
|
|
162 }
|
|
163 }
|
|
164 # findchar application
|
|
165 chardiag <- suppressWarnings(apply(DM,2,vapply,findchar,"character"))
|
|
166 charlist <- which(chardiag == "char")
|
|
167 err.stock <- paste("\n- - - - - - - - -\nYour dataMatrix contains",
|
|
168 length(charlist),
|
|
169 "non-numeric value(s). To help you check your data, please find below a short overview:\n")
|
|
170 charover <- 1
|
|
171 while((length(err.stock)<10)&(length(err.stock)<(length(charlist)+1))){
|
|
172 charex <- paste("variable",colnames(DM)[ceiling(charlist[charover]/nrow(DM))],
|
|
173 "- sample",rownames(DM)[charlist[charover]-floor(charlist[charover]/nrow(DM))*nrow(DM)],
|
|
174 "~> value:",as.matrix(DM)[charlist[charover]],"\n")
|
|
175 err.stock <- c(err.stock,charex)
|
|
176 charover <- charover + 1
|
|
177 }
|
|
178 stop(c(err.stock,"The dataMatrix file is supposed to contain only numeric values.\n- - - - - - - - -\n"))
|
|
179 }
|
3
|
180
|
|
181 ### Initialization
|
|
182
|
|
183 DM <- cbind(c_class,DM)
|
|
184
|
|
185 stat.res <- t(DM[0,-1,drop=FALSE])
|
|
186 names <- NULL
|
|
187
|
|
188 mean.res <- NULL
|
|
189 mean.names <- NULL
|
0
|
190
|
3
|
191 sd.res <- NULL
|
|
192 sd.names <- NULL
|
|
193
|
|
194 med.res <- NULL
|
|
195 med.names <- NULL
|
|
196
|
|
197 quart.res <- NULL
|
|
198 quart.names <- NULL
|
|
199
|
|
200 dec.res <- NULL
|
|
201 dec.names <- NULL
|
|
202
|
|
203 NA.res <- NULL
|
|
204 NA.names <- NULL
|
|
205 pct_NA.res <- NULL
|
|
206 pct_NA.names <- NULL
|
|
207
|
|
208 fold.res <- NULL
|
|
209 fold.names <- NULL
|
|
210
|
|
211 if(("NA" %in% stat.list)||(test.fold=="Yes")){
|
|
212 graphs <- 1
|
|
213 }else{
|
|
214 graphs=0
|
|
215 }
|
|
216
|
|
217 data_bp <- data.frame() #table for NA barplot
|
|
218
|
0
|
219
|
1
|
220
|
3
|
221 ### Computation
|
0
|
222
|
1
|
223
|
3
|
224 for(j in 1:nb_class){
|
|
225
|
|
226 # Mean ---------
|
|
227
|
|
228 if("mean" %in% stat.list){
|
|
229 mean.res <- cbind(mean.res, colMeans(DM[which(DM$c_class==classnames[j]),-1],na.rm=TRUE))
|
|
230 mean.names <- cbind(mean.names, paste("Mean",classnames[j], sep="_"))
|
|
231 if(j == nb_class){
|
|
232 stat.res <- cbind(stat.res, mean.res)
|
|
233 names <- cbind(names, mean.names)
|
|
234 }
|
|
235 }
|
|
236
|
|
237 # Standard deviation -----
|
|
238
|
|
239 if("sd" %in% stat.list){
|
|
240 sd.res <- cbind(sd.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,sd,na.rm=TRUE))
|
|
241 sd.names <- cbind(sd.names, paste("Sd",classnames[j], sep="_"))
|
|
242 if(j == nb_class){
|
|
243 stat.res <- cbind(stat.res, sd.res)
|
|
244 names <- cbind(names, sd.names)
|
|
245 }
|
|
246 }
|
|
247
|
|
248 # Median ---------
|
|
249
|
|
250 if(("median" %in% stat.list)&&(!("quartile" %in% stat.list))){
|
|
251 med.res <- cbind(med.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,median,na.rm=TRUE))
|
|
252 med.names <- cbind(med.names, paste("Median",classnames[j], sep="_"))
|
|
253 if(j == nb_class){
|
|
254 stat.res <- cbind(stat.res, med.res)
|
|
255 names <- cbind(names, med.names)
|
|
256 }
|
|
257 }
|
|
258
|
|
259 # Quartiles ------
|
|
260
|
|
261 if("quartile" %in% stat.list){
|
|
262 quart.res <- cbind(quart.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE)))
|
|
263 quart.names <- cbind(quart.names, paste("Min",classnames[j], sep="_"),paste("Q1",classnames[j], sep="_"),
|
|
264 paste("Median",classnames[j],sep="_"),paste("Q3",classnames[j],sep="_"),
|
|
265 paste("Max",classnames[j],sep="_"))
|
|
266 if(j == nb_class){
|
|
267 stat.res <- cbind(stat.res, quart.res)
|
|
268 names <- cbind(names, quart.names)
|
|
269 }
|
|
270 }
|
|
271
|
|
272 # Deciles ------
|
|
273
|
|
274 if("decile" %in% stat.list){
|
|
275 dec.res <- cbind(dec.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE,seq(0,1,0.1))))
|
|
276 dec.names <- cbind(dec.names, t(matrix(paste((paste("D",seq(0,10,1),sep="")),classnames[j],sep="_"))))
|
|
277 if(j == nb_class){
|
|
278 stat.res <- cbind(stat.res, dec.res)
|
|
279 names <- cbind(names, dec.names)
|
|
280 }
|
|
281 }
|
|
282
|
|
283 # Missing values ------------
|
|
284
|
|
285 if("NA" %in% stat.list){
|
|
286
|
|
287 nb_NA <- apply(DM[which(DM$c_class==classnames[j]),-1],2,function(x) sum(is.na(x)))
|
|
288 pct_NA <- round(nb_NA/nrow(DM[which(DM$c_class==classnames[j]),-1])*100,digits=4)
|
|
289 NA.res <- cbind(NA.res,nb_NA)
|
|
290 pct_NA.res <- cbind(pct_NA.res,pct_NA)
|
|
291 NA.names <- cbind(NA.names, paste("NA",classnames[j], sep="_"))
|
|
292 pct_NA.names <- cbind(pct_NA.names,paste("Pct_NA", classnames[j], sep="_"))
|
|
293 if(j == nb_class){
|
|
294 stat.res <- cbind(stat.res, NA.res,pct_NA.res)
|
|
295 names <- cbind(names, NA.names,pct_NA.names)
|
|
296 }
|
0
|
297
|
3
|
298 #for barplots
|
|
299 Nb_NA_0_20 <- 0
|
|
300 Nb_NA_20_40 <- 0
|
|
301 Nb_NA_40_60 <- 0
|
|
302 Nb_NA_60_80 <- 0
|
|
303 Nb_NA_80_100 <- 0
|
|
304
|
|
305 for (i in 1:length(pct_NA)){
|
|
306
|
|
307 if ((0<=pct_NA[i])&(pct_NA[i]<20)){
|
|
308 Nb_NA_0_20=Nb_NA_0_20+1}
|
|
309
|
|
310 if ((20<=pct_NA[i])&(pct_NA[i]<40)){
|
|
311 Nb_NA_20_40=Nb_NA_20_40+1}
|
|
312
|
|
313 if ((40<=pct_NA[i])&(pct_NA[i]<60)){
|
|
314 Nb_NA_40_60=Nb_NA_40_60+1}
|
|
315
|
|
316 if ((60<=pct_NA[i])&(pct_NA[i]<80)){
|
|
317 Nb_NA_60_80=Nb_NA_60_80+1}
|
|
318
|
|
319 if ((80<=pct_NA[i])&(pct_NA[i]<=100)){
|
|
320 Nb_NA_80_100=Nb_NA_80_100+1}
|
|
321 }
|
|
322 data_bp[1,j] <- Nb_NA_0_20
|
|
323 data_bp[2,j] <- Nb_NA_20_40
|
|
324 data_bp[3,j] <- Nb_NA_40_60
|
|
325 data_bp[4,j] <- Nb_NA_60_80
|
|
326 data_bp[5,j] <- Nb_NA_80_100
|
|
327 rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
|
|
328
|
|
329 if(j == nb_class){
|
|
330
|
|
331 # Alert message if there is no missing value in data matrix
|
|
332 sum_total <- sum(NA.res)
|
|
333 alerte <- NULL
|
|
334 if(sum_total==0){
|
|
335 alerte <- c(alerte, "Data Matrix contains no NA.\n")
|
|
336 }
|
|
337 if(length(alerte) != 0){
|
|
338 cat(alerte,"\n")
|
|
339 }
|
|
340
|
|
341
|
|
342 colnames(data_bp) <- classnames
|
|
343 data_bp <- as.matrix(data_bp)
|
|
344 }
|
|
345 }
|
|
346
|
|
347
|
|
348 # Mean fold change ------------
|
|
349
|
|
350 if(test.fold=="Yes"){
|
|
351 if(nb_class >= 2){
|
|
352 if(j!=nb_class){
|
|
353 ratio1 <- NULL
|
|
354 ratio2 <- NULL
|
|
355 if(method=="each_class"){
|
|
356 fold.frac <- "Top"
|
|
357 }
|
|
358 for(k in (j+1):nb_class) {
|
|
359 if(fold.frac=="Bottom"){
|
|
360 ratio1 <- classnames[k]
|
|
361 ratio2 <- classnames[j]
|
|
362 }else{
|
|
363 ratio1 <- classnames[j]
|
|
364 ratio2 <- classnames[k]
|
|
365 }
|
|
366 fold <- colMeans(DM[which(DM$c_class==ratio1),-1],na.rm=TRUE)/
|
|
367 colMeans(DM[which(DM$c_class==ratio2),-1],na.rm=TRUE)
|
|
368 if(logarithm=="log2"){
|
|
369 fold.res <- cbind(fold.res,log2(fold))
|
|
370 }else if(logarithm=="log10"){
|
|
371 fold.res <- cbind(fold.res,log10(fold))
|
|
372 }else{
|
|
373 fold.res <- cbind(fold.res, fold)
|
|
374 }
|
|
375 if(logarithm == "none"){
|
|
376 fold.names <- cbind(fold.names,paste("fold",ratio1,"VS", ratio2, sep="_"))
|
|
377 }else{
|
|
378 fold.names <- cbind(fold.names,paste(logarithm, "fold", ratio1, "VS", ratio2, sep="_"))
|
|
379 }
|
|
380 }
|
|
381
|
|
382 }else{
|
|
383 stat.res <- cbind(stat.res,fold.res)
|
|
384 names <- cbind(names, fold.names)
|
|
385 }
|
|
386 }
|
|
387 }
|
|
388
|
|
389 }
|
|
390
|
|
391 ############
|
|
392
|
|
393
|
|
394 # check columns names in variableMetadata
|
|
395
|
|
396 VM.names <- colnames(VM)
|
|
397 for (i in 1:length(VM.names)){
|
|
398 for (j in 1:length(names)){
|
|
399 if (VM.names[i]==names[j]){
|
|
400 names[j] <- paste(names[j], "2", sep="_")
|
0
|
401 }
|
|
402 }
|
|
403 }
|
3
|
404
|
|
405 colnames(stat.res) <- names
|
|
406
|
0
|
407
|
1
|
408
|
|
409
|
|
410
|
0
|
411
|
1
|
412 # Output ---------------------------------------------------------------------------------------------------
|
0
|
413
|
3
|
414 VM <-cbind(VM,stat.res)
|
0
|
415
|
|
416 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
|
|
417
|
3
|
418
|
|
419 ### graphics pdf
|
|
420
|
|
421 if(graphs == 1){
|
0
|
422
|
|
423 pdf(graphs.output)
|
3
|
424
|
0
|
425
|
1
|
426 #Barplots for NA
|
3
|
427 if("NA" %in% stat.list){
|
|
428 graph.colors <- c("green3","palegreen3","lightblue","orangered","red")
|
0
|
429 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
|
3
|
430
|
|
431 bp=barplot(data_bp, col=graph.colors, main="Proportion of NA", xlab="Classes", ylab="Variables")
|
|
432 legend("topright", fill=graph.colors,rownames(data_bp), inset=c(-0.3,0))
|
|
433
|
0
|
434 stock=0
|
|
435 for (i in 1:nrow(data_bp)){
|
1
|
436 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
|
0
|
437 stock <- stock+data_bp[i,]
|
|
438 }
|
|
439
|
3
|
440 }
|
|
441
|
|
442 # Boxplots for fold test
|
1
|
443
|
3
|
444 if((test.fold=="Yes")&&(nb_class >= 2)){
|
|
445
|
|
446 clean_fold <- fold.res
|
1
|
447 for(i in 1:nrow(clean_fold)){
|
|
448 for(j in 1:ncol(clean_fold)){
|
|
449 if(is.infinite(clean_fold[i,j])){
|
|
450 clean_fold[i,j] <- NA
|
|
451 }
|
|
452 }
|
|
453 }
|
|
454 for (j in 1:ncol(clean_fold)){
|
|
455 title <- paste(fold.names[j])
|
3
|
456 boxplot(clean_fold[,j], main=title)
|
1
|
457 }
|
3
|
458 }
|
|
459
|
|
460 dev.off()
|
0
|
461
|
3
|
462 }else{
|
|
463 pdf(graphs.output)
|
|
464 plot.new()
|
|
465 legend("center","You did not select any option with graphical output.")
|
|
466 dev.off()
|
|
467 }
|
0
|
468
|
3
|
469 }
|
|
470
|
|
471
|
|
472
|
|
473
|
0
|
474
|
|
475 |