diff Intchecks/Script_intensity_check.R @ 1:4973a2104cfd draft

Uploaded
author melpetera
date Wed, 05 Dec 2018 10:27:45 -0500
parents c2c2e1be904a
children bdee2c2c484b
line wrap: on
line diff
--- a/Intchecks/Script_intensity_check.R	Thu Oct 11 05:33:19 2018 -0400
+++ b/Intchecks/Script_intensity_check.R	Wed Dec 05 10:27:45 2018 -0500
@@ -1,13 +1,12 @@
-####################################################################
-# SCRIPT INTENSITY CHECK
-# V1: Fold and NA
-#
-# Input: Data Matrix, VariableMetadata, SampleMetadata
-# Output: VariableMetadata, Graphics (barplots and boxplots)
-#
-# Dependencies: RcheckLibrary.R 
-#
-####################################################################
+#########################################################################
+# SCRIPT INTENSITY CHECK                                                #
+#                                                                       #
+# Input: Data Matrix, VariableMetadata, SampleMetadata                  #
+# Output: VariableMetadata, Graphics (barplots and boxplots)            #
+#                                                                       #
+# Dependencies: RcheckLibrary.R                                         #
+#                                                                       #
+#########################################################################
 
 
 # Parameters (for dev)
@@ -19,16 +18,24 @@
   DM.name <- "DM_NA.tabular"
   SM.name <- "SM_NA.tabular"
   VM.name <- "vM_NA.tabular"
+  class.col <- "2"
   type <- "One_class"
-  class.col <- "2"
   class1 <- "Blanks"
+  fold.frac <- "Top"
+  logarithm <- "log2"
   VM.output <- "new_VM.txt"
   graphs.output <- "Barplots_and_Boxplots.pdf"
 }
 
-intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){
-  # This function allows to check the intensities considering classes with a fold calculation, the number and
-  # the proportion of NA
+
+
+
+intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm,
+                         VM.output, graphs.output){
+  
+  
+  # This function allows to check the intensities considering classes with a mean fold change calculation,  
+  # the number and the proportion of missing values (NA) in dataMatrix
   #
   # Two options: 
   # - one class (selected by the user) against all the remaining samples ("One_class")
@@ -36,16 +43,19 @@
   #
   # Parameters:
   # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
+  # class.col: number of the sampleMetadata's column with classes
   # type: "One_class" or "Each_class"
-  # class.col: number of the sampleMetadata's column with classes
-  # class1: name of the class if type="One_class"
-  # VM.output: output file's access (VM with the new columns)
-  # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values
+  # class1: name of the class, only if type="One_class"
+  # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom")
+  # logarithm: "log2", "log10" or "none" for log mean fold change
+  # VM.output: output file's access (VM with new columns)
+  # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values
+
   
   
   
   
-  # Input ---------------------------------------------------------
+  # Input ---------------------------------------------------------------------------------------------------
   
   DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
   SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
@@ -58,6 +68,7 @@
   check.err(table.check)
   
   
+  
   rownames(DM) <- DM[,1]
   var_names <- DM[,1]
   DM <- DM[,-1]
@@ -65,7 +76,8 @@
   
   class.col <- colnames(SM)[as.numeric(class.col)]
   
-  # check class.col, class1 and the number of classes---------------
+  
+  # check class.col, class1 and the number of classes ---------------------------------------------------------
   
   if(!(class.col %in% colnames(SM))){
     stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
@@ -76,6 +88,10 @@
   nb_class <- nlevels(c_class)
   classnames <- levels(c_class)
   
+  if(nb_class < 2){
+    err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n")
+    cat(err.1class)  
+  }
   
   if((nb_class > (nrow(SM))/3)){
     class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
@@ -83,6 +99,7 @@
     cat(class.err)
   }
 
+  
   if(type == "One_class"){  
     if(!(class1 %in% classnames)){
       list.class1 <- c("\n Classes:",classnames,"\n")
@@ -92,6 +109,7 @@
     }
   }
   
+  
   #If type is "one_class", change others classes in "other"
   if(type == "One_class"){
     for(i in 1:length(c_class)){
@@ -100,7 +118,7 @@
         c_class[i] <- "Other"
         c_class <- as.factor(c_class)
         nb_class <- nlevels(c_class)
-        classnames <- levels(c_class)
+        classnames <- c(class1,"Other")
         
       }
     }
@@ -108,19 +126,53 @@
 
   DM <- cbind(DM,c_class)
   
-  # fold -------------------------------------------------------
-  n <- 1
-  fold <- data.frame()
-  for(j in 1:(nb_class-1)){
-    for(k in (j+1):nb_class) {
-      for (i in 1:(length(DM)-1)){
-        fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/
-          mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)}
-      names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_")
-      n <- n + 1}
+  
+  
+  # fold calculation -------------------------------------------------------------------------------------------
+  
+  if(nb_class >= 2){
+    
+    
+    fold <- data.frame()
+    n <- 1
+    ratio1 <- NULL
+    ratio2 <- NULL
+    
+    if(type=="Each_class"){
+      fold.frac <- "Top"
+    }
+    
+    for(j in 1:(nb_class-1)){
+      for(k in (j+1):nb_class) {
+        
+        if(fold.frac=="Bottom"){
+          ratio1 <- classnames[k]
+          ratio2 <- classnames[j]
+        }else{
+          ratio1 <- classnames[j]
+          ratio2 <- classnames[k]
+        }
+        
+        for (i in 1:(length(DM)-1)){
+          fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/
+            mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE)
+          if(logarithm=="log2"){
+            fold[i,n] <- log2(fold[i,n])
+          }else if(logarithm=="log10"){
+            fold[i,n] <- log10(fold[i,n])
+          }
+        }
+        names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_")
+        if(logarithm != "none"){
+          names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_")
+        }
+        n <- n + 1}
+    }
+    
   }
   
-  # NA ---------------------------------------------------------
+  # number and proportion of NA ---------------------------------------------------------------------------------
+  
   calcul_NA <- data.frame()
   pct_NA <- data.frame()
   for (i in 1:(length(DM)-1)){
@@ -134,14 +186,13 @@
         pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
     }
   }
-  names(calcul_NA) <- paste("Nb_NA",classnames, sep="_")
+  names(calcul_NA) <- paste("NA",classnames, sep="_")
   names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
   
-  # Alert message if there is no NA
+  # Alert message if there is no NA in data matrix
   
   sumNA <- colSums(calcul_NA)
   sum_total <- sum(sumNA)
-  
   alerte <- NULL
   if(sum_total==0){
     alerte <- c(alerte, "Data Matrix contains no NA.\n")
@@ -150,23 +201,31 @@
   if(length(alerte) != 0){ 
     cat(alerte,"\n")
   }
-  
   table_NA <- cbind(calcul_NA, pct_NA)
   
-  # check columns names ----------------------------------------
+  
+  
+  # check columns names ---------------------------------------------------------------------------------------
+  
+
+  VM.names <- colnames(VM)
   
   # Fold
-  VM.names <- colnames(VM)
-  fold.names <- colnames(fold)
+  
+  if(nb_class >=2){
+    fold.names <- colnames(fold)
   
-  for (i in 1:length(VM.names)){
-    for (j in 1:length(fold.names)){
-      if (VM.names[i]==fold.names[j]){
-        fold.names[j] <- paste(fold.names[j],"2", sep="_")
+    for (i in 1:length(VM.names)){
+      for (j in 1:length(fold.names)){
+        if (VM.names[i]==fold.names[j]){
+          fold.names[j] <- paste(fold.names[j],"2", sep="_")
+        }
       }
     }
+    colnames(fold) <- fold.names
+    
+    VM <- cbind(VM,fold)
   }
-  colnames(fold) <- fold.names
   
   # NA
   NA.names <- colnames(table_NA)
@@ -179,8 +238,10 @@
     }
   }
   colnames(table_NA) <- NA.names
+  VM <- cbind(VM,table_NA)
   
-  #for NA barplots ---------------------------------------------
+  
+  #for NA barplots -------------------------------------------------------------------------------------------
   
   data_bp <- data.frame()
   
@@ -214,14 +275,13 @@
     data_bp[4,j] <- Nb_NA_60_80
     data_bp[5,j] <- Nb_NA_80_100
   }
-  rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%")
+  rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
   colnames(data_bp) <- classnames
   data_bp <- as.matrix(data_bp)
   
   
-  # Output--------------------------------------------------------
+  # Output ---------------------------------------------------------------------------------------------------
   
-  VM <- cbind(VM,fold,table_NA)
   
   write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
   
@@ -229,36 +289,39 @@
   
   pdf(graphs.output)
   
-  #Boxplots for NA
-  
+  #Barplots for NA
   par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
   
   bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
   legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
   
-  
   stock=0
   for (i in 1:nrow(data_bp)){
-    text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white")
+    text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
     stock <- stock+data_bp[i,]
   }
   
   
   #Boxplots for fold test
-  for (j in 1:ncol(fold)){
-    title=paste(fold.names[j])
-    boxplot(fold[j], main=title)
-  }
+  
+  if(nb_class >= 2){
+    
+    clean_fold <- fold
+    for(i in 1:nrow(clean_fold)){
+      for(j in 1:ncol(clean_fold)){
+        if(is.infinite(clean_fold[i,j])){
+          clean_fold[i,j] <- NA
+        }
+      }
+    }
+    for (j in 1:ncol(clean_fold)){
+      title <- paste(fold.names[j])
+      boxplot(clean_fold[j], main=title)
+    }
+    }
   
   dev.off()
 
 }  
 
-
-# Function call---------------
-
-#setwd("Y:\\Developpement\\Intensity check\\Pour tests")
-#intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", 
-#"Barplots_and_Boxplots")
-
   
\ No newline at end of file