Mercurial > repos > melpetera > intensity_checks
diff Intchecks/Script_intensity_check.R @ 3:bdee2c2c484b draft
Uploaded
author | melpetera |
---|---|
date | Fri, 08 Mar 2019 09:07:12 -0500 |
parents | 4973a2104cfd |
children | a31f3f802b2b |
line wrap: on
line diff
--- a/Intchecks/Script_intensity_check.R Mon Jan 14 08:17:26 2019 -0500 +++ b/Intchecks/Script_intensity_check.R Fri Mar 08 09:07:12 2019 -0500 @@ -2,7 +2,7 @@ # SCRIPT INTENSITY CHECK # # # # Input: Data Matrix, VariableMetadata, SampleMetadata # -# Output: VariableMetadata, Graphics (barplots and boxplots) # +# Output: VariableMetadata, Graphics # # # # Dependencies: RcheckLibrary.R # # # @@ -13,16 +13,18 @@ if(FALSE){ rm(list = ls()) - setwd("Y:\\Developpement\\Intensity check\\Pour tests") + setwd("Y:\\Developpement\\Intensity check\\Pour tests\\Tests_global") DM.name <- "DM_NA.tabular" SM.name <- "SM_NA.tabular" VM.name <- "vM_NA.tabular" - class.col <- "2" - type <- "One_class" - class1 <- "Blanks" + method <- "one_class" + chosen.stat <- "mean,sd,quartile,decile,NA" + class.col <- "2" + test.fold <- "Yes" + class1 <- "Pools" fold.frac <- "Top" - logarithm <- "log2" + logarithm <- "log10" VM.output <- "new_VM.txt" graphs.output <- "Barplots_and_Boxplots.pdf" } @@ -30,31 +32,31 @@ -intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm, - VM.output, graphs.output){ - +intens_check <- function(DM.name, SM.name, VM.name, method, chosen.stat, class.col, test.fold, class1, fold.frac, + logarithm, VM.output, graphs.output){ - # This function allows to check the intensities considering classes with a mean fold change calculation, - # the number and the proportion of missing values (NA) in dataMatrix + # This function allows to check the intensities with various statistics, number of missing values and mean fold change # - # Two options: - # - one class (selected by the user) against all the remaining samples ("One_class") - # - tests on each class ("Each_class") + # Three methods proposed: + # - global: tests for each variable without distinction between samples + # - one class: one class versus all the remaining samples + # - each class: if the class columns contains at least three classes and you want to test each of them # # Parameters: # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access - # class.col: number of the sampleMetadata's column with classes - # type: "One_class" or "Each_class" - # class1: name of the class, only if type="One_class" - # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom") - # logarithm: "log2", "log10" or "none" for log mean fold change + # method: "global", "one_class", "each_class" + # chosen.stat: character listing the chosen analysis (comma-separated) + # class.col: number of the sampleMetadata's column with classes (if method = one_class or each_class) + # test.fold: "yes" or "no" (if method = one_class or each_class) + # class1: name of the class (if method = one_class) + # fold.frac: "Top" -> class1/other or "Bottom" -> other/class1 (if method = one_class) + # logarithm: "log2", "log10" or "none" (if method = one_class or each_class) # VM.output: output file's access (VM with new columns) # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values - # Input --------------------------------------------------------------------------------------------------- DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) @@ -66,247 +68,344 @@ # Table match check with Rchecklibrary table.check <- match3(DM, SM, VM) check.err(table.check) - - + rownames(DM) <- DM[,1] var_names <- DM[,1] DM <- DM[,-1] DM <- data.frame(t(DM)) - - class.col <- colnames(SM)[as.numeric(class.col)] + + stat.list <- strsplit(chosen.stat,",")[[1]] # check class.col, class1 and the number of classes --------------------------------------------------------- - if(!(class.col %in% colnames(SM))){ - stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") + #set 1 class for all samples in case of method = no_class + if(method=="no_class"){ + c_class <- rep("global", length=nrow(DM)) + classnames <- "global" + nb_class=1 + test.fold <- "No" } + + + if(method != "no_class"){ + + class.col <- colnames(SM)[as.numeric(class.col)] + + if(!(class.col %in% colnames(SM))){ + stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") + } - c_class <- SM[,class.col] - c_class <- as.factor(c_class) - nb_class <- nlevels(c_class) - classnames <- levels(c_class) + c_class <- SM[,class.col] + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + + if((nb_class < 2)&&(test.fold=="Yes")){ + err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n") + cat(err.1class) + } + + if((nb_class > (nrow(SM))/3)&&(method == "each_class")){ + class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those + with few samples \n") + cat(class.err) + } + + + if(method == "one_class"){ + if(!(class1 %in% classnames)){ + list.class1 <- c("\n Classes:",classnames,"\n") + cat(list.class1) + err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) + stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") + } - if(nb_class < 2){ - err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n") - cat(err.1class) + #If method is "one_class", change others classes in "other" + for(i in 1:length(c_class)){ + if(c_class[i]!=class1){ + c_class <- as.character(c_class) + c_class[i] <- "Other" + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- c(class1,"Other") + } + } + } + } + + + # Statistics ------------------------------------------------------------------------------------------------ + + + ### Initialization + + DM <- cbind(c_class,DM) + + stat.res <- t(DM[0,-1,drop=FALSE]) + names <- NULL + + mean.res <- NULL + mean.names <- NULL - if((nb_class > (nrow(SM))/3)){ - class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those - with few samples \n") - cat(class.err) - } + sd.res <- NULL + sd.names <- NULL + + med.res <- NULL + med.names <- NULL + + quart.res <- NULL + quart.names <- NULL + + dec.res <- NULL + dec.names <- NULL + + NA.res <- NULL + NA.names <- NULL + pct_NA.res <- NULL + pct_NA.names <- NULL + + fold.res <- NULL + fold.names <- NULL + + if(("NA" %in% stat.list)||(test.fold=="Yes")){ + graphs <- 1 + }else{ + graphs=0 + } + + data_bp <- data.frame() #table for NA barplot + - if(type == "One_class"){ - if(!(class1 %in% classnames)){ - list.class1 <- c("\n Classes:",classnames,"\n") - cat(list.class1) - err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) - stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") - } - } + ### Computation - #If type is "one_class", change others classes in "other" - if(type == "One_class"){ - for(i in 1:length(c_class)){ - if(c_class[i]!=class1){ - c_class <- as.character(c_class) - c_class[i] <- "Other" - c_class <- as.factor(c_class) - nb_class <- nlevels(c_class) - classnames <- c(class1,"Other") + for(j in 1:nb_class){ + + # Mean --------- + + if("mean" %in% stat.list){ + mean.res <- cbind(mean.res, colMeans(DM[which(DM$c_class==classnames[j]),-1],na.rm=TRUE)) + mean.names <- cbind(mean.names, paste("Mean",classnames[j], sep="_")) + if(j == nb_class){ + stat.res <- cbind(stat.res, mean.res) + names <- cbind(names, mean.names) + } + } + + # Standard deviation ----- + + if("sd" %in% stat.list){ + sd.res <- cbind(sd.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,sd,na.rm=TRUE)) + sd.names <- cbind(sd.names, paste("Sd",classnames[j], sep="_")) + if(j == nb_class){ + stat.res <- cbind(stat.res, sd.res) + names <- cbind(names, sd.names) + } + } + + # Median --------- + + if(("median" %in% stat.list)&&(!("quartile" %in% stat.list))){ + med.res <- cbind(med.res, apply(DM[which(DM$c_class==classnames[j]),-1],2,median,na.rm=TRUE)) + med.names <- cbind(med.names, paste("Median",classnames[j], sep="_")) + if(j == nb_class){ + stat.res <- cbind(stat.res, med.res) + names <- cbind(names, med.names) + } + } + + # Quartiles ------ + + if("quartile" %in% stat.list){ + quart.res <- cbind(quart.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE))) + quart.names <- cbind(quart.names, paste("Min",classnames[j], sep="_"),paste("Q1",classnames[j], sep="_"), + paste("Median",classnames[j],sep="_"),paste("Q3",classnames[j],sep="_"), + paste("Max",classnames[j],sep="_")) + if(j == nb_class){ + stat.res <- cbind(stat.res, quart.res) + names <- cbind(names, quart.names) + } + } + + # Deciles ------ + + if("decile" %in% stat.list){ + dec.res <- cbind(dec.res,t(apply(DM[which(DM$c_class==classnames[j]),-1],2,quantile,na.rm=TRUE,seq(0,1,0.1)))) + dec.names <- cbind(dec.names, t(matrix(paste((paste("D",seq(0,10,1),sep="")),classnames[j],sep="_")))) + if(j == nb_class){ + stat.res <- cbind(stat.res, dec.res) + names <- cbind(names, dec.names) + } + } + + # Missing values ------------ + + if("NA" %in% stat.list){ + + nb_NA <- apply(DM[which(DM$c_class==classnames[j]),-1],2,function(x) sum(is.na(x))) + pct_NA <- round(nb_NA/nrow(DM[which(DM$c_class==classnames[j]),-1])*100,digits=4) + NA.res <- cbind(NA.res,nb_NA) + pct_NA.res <- cbind(pct_NA.res,pct_NA) + NA.names <- cbind(NA.names, paste("NA",classnames[j], sep="_")) + pct_NA.names <- cbind(pct_NA.names,paste("Pct_NA", classnames[j], sep="_")) + if(j == nb_class){ + stat.res <- cbind(stat.res, NA.res,pct_NA.res) + names <- cbind(names, NA.names,pct_NA.names) + } + #for barplots + Nb_NA_0_20 <- 0 + Nb_NA_20_40 <- 0 + Nb_NA_40_60 <- 0 + Nb_NA_60_80 <- 0 + Nb_NA_80_100 <- 0 + + for (i in 1:length(pct_NA)){ + + if ((0<=pct_NA[i])&(pct_NA[i]<20)){ + Nb_NA_0_20=Nb_NA_0_20+1} + + if ((20<=pct_NA[i])&(pct_NA[i]<40)){ + Nb_NA_20_40=Nb_NA_20_40+1} + + if ((40<=pct_NA[i])&(pct_NA[i]<60)){ + Nb_NA_40_60=Nb_NA_40_60+1} + + if ((60<=pct_NA[i])&(pct_NA[i]<80)){ + Nb_NA_60_80=Nb_NA_60_80+1} + + if ((80<=pct_NA[i])&(pct_NA[i]<=100)){ + Nb_NA_80_100=Nb_NA_80_100+1} + } + data_bp[1,j] <- Nb_NA_0_20 + data_bp[2,j] <- Nb_NA_20_40 + data_bp[3,j] <- Nb_NA_40_60 + data_bp[4,j] <- Nb_NA_60_80 + data_bp[5,j] <- Nb_NA_80_100 + rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") + + if(j == nb_class){ + + # Alert message if there is no missing value in data matrix + sum_total <- sum(NA.res) + alerte <- NULL + if(sum_total==0){ + alerte <- c(alerte, "Data Matrix contains no NA.\n") + } + if(length(alerte) != 0){ + cat(alerte,"\n") + } + + + colnames(data_bp) <- classnames + data_bp <- as.matrix(data_bp) + } + } + + + # Mean fold change ------------ + + if(test.fold=="Yes"){ + if(nb_class >= 2){ + if(j!=nb_class){ + ratio1 <- NULL + ratio2 <- NULL + if(method=="each_class"){ + fold.frac <- "Top" + } + for(k in (j+1):nb_class) { + if(fold.frac=="Bottom"){ + ratio1 <- classnames[k] + ratio2 <- classnames[j] + }else{ + ratio1 <- classnames[j] + ratio2 <- classnames[k] + } + fold <- colMeans(DM[which(DM$c_class==ratio1),-1],na.rm=TRUE)/ + colMeans(DM[which(DM$c_class==ratio2),-1],na.rm=TRUE) + if(logarithm=="log2"){ + fold.res <- cbind(fold.res,log2(fold)) + }else if(logarithm=="log10"){ + fold.res <- cbind(fold.res,log10(fold)) + }else{ + fold.res <- cbind(fold.res, fold) + } + if(logarithm == "none"){ + fold.names <- cbind(fold.names,paste("fold",ratio1,"VS", ratio2, sep="_")) + }else{ + fold.names <- cbind(fold.names,paste(logarithm, "fold", ratio1, "VS", ratio2, sep="_")) + } + } + + }else{ + stat.res <- cbind(stat.res,fold.res) + names <- cbind(names, fold.names) + } + } + } + + } + + ############ + + + # check columns names in variableMetadata + + VM.names <- colnames(VM) + for (i in 1:length(VM.names)){ + for (j in 1:length(names)){ + if (VM.names[i]==names[j]){ + names[j] <- paste(names[j], "2", sep="_") } } } + + colnames(stat.res) <- names + - DM <- cbind(DM,c_class) - - # fold calculation ------------------------------------------------------------------------------------------- - - if(nb_class >= 2){ - - - fold <- data.frame() - n <- 1 - ratio1 <- NULL - ratio2 <- NULL - - if(type=="Each_class"){ - fold.frac <- "Top" - } - - for(j in 1:(nb_class-1)){ - for(k in (j+1):nb_class) { - - if(fold.frac=="Bottom"){ - ratio1 <- classnames[k] - ratio2 <- classnames[j] - }else{ - ratio1 <- classnames[j] - ratio2 <- classnames[k] - } - - for (i in 1:(length(DM)-1)){ - fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/ - mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE) - if(logarithm=="log2"){ - fold[i,n] <- log2(fold[i,n]) - }else if(logarithm=="log10"){ - fold[i,n] <- log10(fold[i,n]) - } - } - names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_") - if(logarithm != "none"){ - names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_") - } - n <- n + 1} - } - - } - - # number and proportion of NA --------------------------------------------------------------------------------- - - calcul_NA <- data.frame() - pct_NA <- data.frame() - for (i in 1:(length(DM)-1)){ - for (j in 1:nb_class){ - n <- 0 - new_DM <- DM[which(DM$c_class==classnames[j]),i] - for(k in 1:length(new_DM)){ - if (is.na(new_DM[k])){ - n <- n + 1} - calcul_NA[i,j] <- n - pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} - } - } - names(calcul_NA) <- paste("NA",classnames, sep="_") - names(pct_NA) <- paste("Pct_NA", classnames, sep="_") - - # Alert message if there is no NA in data matrix - - sumNA <- colSums(calcul_NA) - sum_total <- sum(sumNA) - alerte <- NULL - if(sum_total==0){ - alerte <- c(alerte, "Data Matrix contains no NA.\n") - } - - if(length(alerte) != 0){ - cat(alerte,"\n") - } - table_NA <- cbind(calcul_NA, pct_NA) - - - - # check columns names --------------------------------------------------------------------------------------- - - VM.names <- colnames(VM) - - # Fold - - if(nb_class >=2){ - fold.names <- colnames(fold) - - for (i in 1:length(VM.names)){ - for (j in 1:length(fold.names)){ - if (VM.names[i]==fold.names[j]){ - fold.names[j] <- paste(fold.names[j],"2", sep="_") - } - } - } - colnames(fold) <- fold.names - - VM <- cbind(VM,fold) - } - - # NA - NA.names <- colnames(table_NA) - - for (i in 1:length(VM.names)){ - for (j in 1:length(NA.names)){ - if (VM.names[i]==NA.names[j]){ - NA.names[j] <- paste(NA.names[j],"2", sep="_") - } - } - } - colnames(table_NA) <- NA.names - VM <- cbind(VM,table_NA) - - - #for NA barplots ------------------------------------------------------------------------------------------- - - data_bp <- data.frame() - - for (j in 1:ncol(pct_NA)){ - Nb_NA_0_20 <- 0 - Nb_NA_20_40 <- 0 - Nb_NA_40_60 <- 0 - Nb_NA_60_80 <- 0 - Nb_NA_80_100 <- 0 - for (i in 1:nrow(pct_NA)){ - - if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){ - Nb_NA_0_20=Nb_NA_0_20+1 - } - - if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){ - Nb_NA_20_40=Nb_NA_20_40+1} - - if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){ - Nb_NA_40_60=Nb_NA_40_60+1} - - if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){ - Nb_NA_60_80=Nb_NA_60_80+1} - - if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){ - Nb_NA_80_100=Nb_NA_80_100+1} - } - data_bp[1,j] <- Nb_NA_0_20 - data_bp[2,j] <- Nb_NA_20_40 - data_bp[3,j] <- Nb_NA_40_60 - data_bp[4,j] <- Nb_NA_60_80 - data_bp[5,j] <- Nb_NA_80_100 - } - rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") - colnames(data_bp) <- classnames - data_bp <- as.matrix(data_bp) - # Output --------------------------------------------------------------------------------------------------- + VM <-cbind(VM,stat.res) write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) - #graphics pdf + + ### graphics pdf + + if(graphs == 1){ pdf(graphs.output) + #Barplots for NA + if("NA" %in% stat.list){ + graph.colors <- c("green3","palegreen3","lightblue","orangered","red") par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) - - bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") - legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) - + + bp=barplot(data_bp, col=graph.colors, main="Proportion of NA", xlab="Classes", ylab="Variables") + legend("topright", fill=graph.colors,rownames(data_bp), inset=c(-0.3,0)) + stock=0 for (i in 1:nrow(data_bp)){ text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7) stock <- stock+data_bp[i,] } - - #Boxplots for fold test + } + + # Boxplots for fold test - if(nb_class >= 2){ - - clean_fold <- fold + if((test.fold=="Yes")&&(nb_class >= 2)){ + + clean_fold <- fold.res for(i in 1:nrow(clean_fold)){ for(j in 1:ncol(clean_fold)){ if(is.infinite(clean_fold[i,j])){ @@ -316,12 +415,23 @@ } for (j in 1:ncol(clean_fold)){ title <- paste(fold.names[j]) - boxplot(clean_fold[j], main=title) + boxplot(clean_fold[,j], main=title) } - } + } + + dev.off() - dev.off() + }else{ + pdf(graphs.output) + plot.new() + legend("center","You did not select any option with graphical output.") + dev.off() + } -} + } + + + + \ No newline at end of file