diff Intchecks/Script_intensity_check.R @ 0:c2c2e1be904a draft

Uploaded
author melpetera
date Thu, 11 Oct 2018 05:33:19 -0400
parents
children 4973a2104cfd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Intchecks/Script_intensity_check.R	Thu Oct 11 05:33:19 2018 -0400
@@ -0,0 +1,264 @@
+####################################################################
+# SCRIPT INTENSITY CHECK
+# V1: Fold and NA
+#
+# Input: Data Matrix, VariableMetadata, SampleMetadata
+# Output: VariableMetadata, Graphics (barplots and boxplots)
+#
+# Dependencies: RcheckLibrary.R 
+#
+####################################################################
+
+
+# Parameters (for dev)
+if(FALSE){
+  
+  rm(list = ls())
+  setwd("Y:\\Developpement\\Intensity check\\Pour tests")
+  
+  DM.name <- "DM_NA.tabular"
+  SM.name <- "SM_NA.tabular"
+  VM.name <- "vM_NA.tabular"
+  type <- "One_class"
+  class.col <- "2"
+  class1 <- "Blanks"
+  VM.output <- "new_VM.txt"
+  graphs.output <- "Barplots_and_Boxplots.pdf"
+}
+
+intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){
+  # This function allows to check the intensities considering classes with a fold calculation, the number and
+  # the proportion of NA
+  #
+  # Two options: 
+  # - one class (selected by the user) against all the remaining samples ("One_class")
+  # - tests on each class ("Each_class")
+  #
+  # Parameters:
+  # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
+  # type: "One_class" or "Each_class"
+  # class.col: number of the sampleMetadata's column with classes
+  # class1: name of the class if type="One_class"
+  # VM.output: output file's access (VM with the new columns)
+  # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values
+  
+  
+  
+  
+  # Input ---------------------------------------------------------
+  
+  DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
+  SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
+  VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
+  
+  
+  
+  # Table match check with Rchecklibrary
+  table.check <- match3(DM, SM, VM)
+  check.err(table.check)
+  
+  
+  rownames(DM) <- DM[,1]
+  var_names <- DM[,1]
+  DM <- DM[,-1]
+  DM <- data.frame(t(DM))
+  
+  class.col <- colnames(SM)[as.numeric(class.col)]
+  
+  # check class.col, class1 and the number of classes---------------
+  
+  if(!(class.col %in% colnames(SM))){
+    stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
+  }
+  
+  c_class <- SM[,class.col]
+  c_class <- as.factor(c_class)
+  nb_class <- nlevels(c_class)
+  classnames <- levels(c_class)
+  
+  
+  if((nb_class > (nrow(SM))/3)){
+    class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
+                   with few samples \n")
+    cat(class.err)
+  }
+
+  if(type == "One_class"){  
+    if(!(class1 %in% classnames)){
+      list.class1 <- c("\n Classes:",classnames,"\n")
+      cat(list.class1)
+      err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
+      stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
+    }
+  }
+  
+  #If type is "one_class", change others classes in "other"
+  if(type == "One_class"){
+    for(i in 1:length(c_class)){
+      if(c_class[i]!=class1){
+        c_class <- as.character(c_class)
+        c_class[i] <- "Other"
+        c_class <- as.factor(c_class)
+        nb_class <- nlevels(c_class)
+        classnames <- levels(c_class)
+        
+      }
+    }
+  }
+
+  DM <- cbind(DM,c_class)
+  
+  # fold -------------------------------------------------------
+  n <- 1
+  fold <- data.frame()
+  for(j in 1:(nb_class-1)){
+    for(k in (j+1):nb_class) {
+      for (i in 1:(length(DM)-1)){
+        fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/
+          mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)}
+      names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_")
+      n <- n + 1}
+  }
+  
+  # NA ---------------------------------------------------------
+  calcul_NA <- data.frame()
+  pct_NA <- data.frame()
+  for (i in 1:(length(DM)-1)){
+    for (j in 1:nb_class){
+      n <- 0
+      new_DM <- DM[which(DM$c_class==classnames[j]),i]
+      for(k in 1:length(new_DM)){
+        if (is.na(new_DM[k])){
+          n <- n + 1}
+        calcul_NA[i,j] <- n
+        pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
+    }
+  }
+  names(calcul_NA) <- paste("Nb_NA",classnames, sep="_")
+  names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
+  
+  # Alert message if there is no NA
+  
+  sumNA <- colSums(calcul_NA)
+  sum_total <- sum(sumNA)
+  
+  alerte <- NULL
+  if(sum_total==0){
+    alerte <- c(alerte, "Data Matrix contains no NA.\n")
+  }
+  
+  if(length(alerte) != 0){ 
+    cat(alerte,"\n")
+  }
+  
+  table_NA <- cbind(calcul_NA, pct_NA)
+  
+  # check columns names ----------------------------------------
+  
+  # Fold
+  VM.names <- colnames(VM)
+  fold.names <- colnames(fold)
+  
+  for (i in 1:length(VM.names)){
+    for (j in 1:length(fold.names)){
+      if (VM.names[i]==fold.names[j]){
+        fold.names[j] <- paste(fold.names[j],"2", sep="_")
+      }
+    }
+  }
+  colnames(fold) <- fold.names
+  
+  # NA
+  NA.names <- colnames(table_NA)
+  
+  for (i in 1:length(VM.names)){
+    for (j in 1:length(NA.names)){
+      if (VM.names[i]==NA.names[j]){
+        NA.names[j] <- paste(NA.names[j],"2", sep="_")
+      }
+    }
+  }
+  colnames(table_NA) <- NA.names
+  
+  #for NA barplots ---------------------------------------------
+  
+  data_bp <- data.frame()
+  
+  for (j in 1:ncol(pct_NA)){
+    Nb_NA_0_20 <- 0
+    Nb_NA_20_40 <- 0
+    Nb_NA_40_60 <- 0
+    Nb_NA_60_80 <- 0
+    Nb_NA_80_100 <- 0
+    for (i in 1:nrow(pct_NA)){
+      
+      if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
+        Nb_NA_0_20=Nb_NA_0_20+1
+      }
+      
+      if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
+        Nb_NA_20_40=Nb_NA_20_40+1}
+      
+      if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
+        Nb_NA_40_60=Nb_NA_40_60+1}
+      
+      if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
+        Nb_NA_60_80=Nb_NA_60_80+1}   
+      
+      if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
+        Nb_NA_80_100=Nb_NA_80_100+1}   
+    }
+    data_bp[1,j] <- Nb_NA_0_20
+    data_bp[2,j] <- Nb_NA_20_40
+    data_bp[3,j] <- Nb_NA_40_60
+    data_bp[4,j] <- Nb_NA_60_80
+    data_bp[5,j] <- Nb_NA_80_100
+  }
+  rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%")
+  colnames(data_bp) <- classnames
+  data_bp <- as.matrix(data_bp)
+  
+  
+  # Output--------------------------------------------------------
+  
+  VM <- cbind(VM,fold,table_NA)
+  
+  write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
+  
+  #graphics pdf
+  
+  pdf(graphs.output)
+  
+  #Boxplots for NA
+  
+  par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
+  
+  bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
+  legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
+  
+  
+  stock=0
+  for (i in 1:nrow(data_bp)){
+    text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white")
+    stock <- stock+data_bp[i,]
+  }
+  
+  
+  #Boxplots for fold test
+  for (j in 1:ncol(fold)){
+    title=paste(fold.names[j])
+    boxplot(fold[j], main=title)
+  }
+  
+  dev.off()
+
+}  
+
+
+# Function call---------------
+
+#setwd("Y:\\Developpement\\Intensity check\\Pour tests")
+#intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", 
+#"Barplots_and_Boxplots")
+
+  
\ No newline at end of file