view Intchecks/Script_intensity_check.R @ 1:4973a2104cfd draft

Uploaded
author melpetera
date Wed, 05 Dec 2018 10:27:45 -0500
parents c2c2e1be904a
children bdee2c2c484b
line wrap: on
line source

#########################################################################
# SCRIPT INTENSITY CHECK                                                #
#                                                                       #
# Input: Data Matrix, VariableMetadata, SampleMetadata                  #
# Output: VariableMetadata, Graphics (barplots and boxplots)            #
#                                                                       #
# Dependencies: RcheckLibrary.R                                         #
#                                                                       #
#########################################################################


# Parameters (for dev)
if(FALSE){
  
  rm(list = ls())
  setwd("Y:\\Developpement\\Intensity check\\Pour tests")
  
  DM.name <- "DM_NA.tabular"
  SM.name <- "SM_NA.tabular"
  VM.name <- "vM_NA.tabular"
  class.col <- "2"
  type <- "One_class"
  class1 <- "Blanks"
  fold.frac <- "Top"
  logarithm <- "log2"
  VM.output <- "new_VM.txt"
  graphs.output <- "Barplots_and_Boxplots.pdf"
}




intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm,
                         VM.output, graphs.output){
  
  
  # This function allows to check the intensities considering classes with a mean fold change calculation,  
  # the number and the proportion of missing values (NA) in dataMatrix
  #
  # Two options: 
  # - one class (selected by the user) against all the remaining samples ("One_class")
  # - tests on each class ("Each_class")
  #
  # Parameters:
  # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
  # class.col: number of the sampleMetadata's column with classes
  # type: "One_class" or "Each_class"
  # class1: name of the class, only if type="One_class"
  # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom")
  # logarithm: "log2", "log10" or "none" for log mean fold change
  # VM.output: output file's access (VM with new columns)
  # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values

  
  
  
  
  # Input ---------------------------------------------------------------------------------------------------
  
  DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
  SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
  VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
  
  
  
  # Table match check with Rchecklibrary
  table.check <- match3(DM, SM, VM)
  check.err(table.check)
  
  
  
  rownames(DM) <- DM[,1]
  var_names <- DM[,1]
  DM <- DM[,-1]
  DM <- data.frame(t(DM))
  
  class.col <- colnames(SM)[as.numeric(class.col)]
  
  
  # check class.col, class1 and the number of classes ---------------------------------------------------------
  
  if(!(class.col %in% colnames(SM))){
    stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
  }
  
  c_class <- SM[,class.col]
  c_class <- as.factor(c_class)
  nb_class <- nlevels(c_class)
  classnames <- levels(c_class)
  
  if(nb_class < 2){
    err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n")
    cat(err.1class)  
  }
  
  if((nb_class > (nrow(SM))/3)){
    class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
                   with few samples \n")
    cat(class.err)
  }

  
  if(type == "One_class"){  
    if(!(class1 %in% classnames)){
      list.class1 <- c("\n Classes:",classnames,"\n")
      cat(list.class1)
      err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
      stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
    }
  }
  
  
  #If type is "one_class", change others classes in "other"
  if(type == "One_class"){
    for(i in 1:length(c_class)){
      if(c_class[i]!=class1){
        c_class <- as.character(c_class)
        c_class[i] <- "Other"
        c_class <- as.factor(c_class)
        nb_class <- nlevels(c_class)
        classnames <- c(class1,"Other")
        
      }
    }
  }

  DM <- cbind(DM,c_class)
  
  
  
  # fold calculation -------------------------------------------------------------------------------------------
  
  if(nb_class >= 2){
    
    
    fold <- data.frame()
    n <- 1
    ratio1 <- NULL
    ratio2 <- NULL
    
    if(type=="Each_class"){
      fold.frac <- "Top"
    }
    
    for(j in 1:(nb_class-1)){
      for(k in (j+1):nb_class) {
        
        if(fold.frac=="Bottom"){
          ratio1 <- classnames[k]
          ratio2 <- classnames[j]
        }else{
          ratio1 <- classnames[j]
          ratio2 <- classnames[k]
        }
        
        for (i in 1:(length(DM)-1)){
          fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/
            mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE)
          if(logarithm=="log2"){
            fold[i,n] <- log2(fold[i,n])
          }else if(logarithm=="log10"){
            fold[i,n] <- log10(fold[i,n])
          }
        }
        names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_")
        if(logarithm != "none"){
          names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_")
        }
        n <- n + 1}
    }
    
  }
  
  # number and proportion of NA ---------------------------------------------------------------------------------
  
  calcul_NA <- data.frame()
  pct_NA <- data.frame()
  for (i in 1:(length(DM)-1)){
    for (j in 1:nb_class){
      n <- 0
      new_DM <- DM[which(DM$c_class==classnames[j]),i]
      for(k in 1:length(new_DM)){
        if (is.na(new_DM[k])){
          n <- n + 1}
        calcul_NA[i,j] <- n
        pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
    }
  }
  names(calcul_NA) <- paste("NA",classnames, sep="_")
  names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
  
  # Alert message if there is no NA in data matrix
  
  sumNA <- colSums(calcul_NA)
  sum_total <- sum(sumNA)
  alerte <- NULL
  if(sum_total==0){
    alerte <- c(alerte, "Data Matrix contains no NA.\n")
  }
  
  if(length(alerte) != 0){ 
    cat(alerte,"\n")
  }
  table_NA <- cbind(calcul_NA, pct_NA)
  
  
  
  # check columns names ---------------------------------------------------------------------------------------
  

  VM.names <- colnames(VM)
  
  # Fold
  
  if(nb_class >=2){
    fold.names <- colnames(fold)
  
    for (i in 1:length(VM.names)){
      for (j in 1:length(fold.names)){
        if (VM.names[i]==fold.names[j]){
          fold.names[j] <- paste(fold.names[j],"2", sep="_")
        }
      }
    }
    colnames(fold) <- fold.names
    
    VM <- cbind(VM,fold)
  }
  
  # NA
  NA.names <- colnames(table_NA)
  
  for (i in 1:length(VM.names)){
    for (j in 1:length(NA.names)){
      if (VM.names[i]==NA.names[j]){
        NA.names[j] <- paste(NA.names[j],"2", sep="_")
      }
    }
  }
  colnames(table_NA) <- NA.names
  VM <- cbind(VM,table_NA)
  
  
  #for NA barplots -------------------------------------------------------------------------------------------
  
  data_bp <- data.frame()
  
  for (j in 1:ncol(pct_NA)){
    Nb_NA_0_20 <- 0
    Nb_NA_20_40 <- 0
    Nb_NA_40_60 <- 0
    Nb_NA_60_80 <- 0
    Nb_NA_80_100 <- 0
    for (i in 1:nrow(pct_NA)){
      
      if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
        Nb_NA_0_20=Nb_NA_0_20+1
      }
      
      if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
        Nb_NA_20_40=Nb_NA_20_40+1}
      
      if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
        Nb_NA_40_60=Nb_NA_40_60+1}
      
      if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
        Nb_NA_60_80=Nb_NA_60_80+1}   
      
      if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
        Nb_NA_80_100=Nb_NA_80_100+1}   
    }
    data_bp[1,j] <- Nb_NA_0_20
    data_bp[2,j] <- Nb_NA_20_40
    data_bp[3,j] <- Nb_NA_40_60
    data_bp[4,j] <- Nb_NA_60_80
    data_bp[5,j] <- Nb_NA_80_100
  }
  rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
  colnames(data_bp) <- classnames
  data_bp <- as.matrix(data_bp)
  
  
  # Output ---------------------------------------------------------------------------------------------------
  
  
  write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
  
  #graphics pdf
  
  pdf(graphs.output)
  
  #Barplots for NA
  par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
  
  bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
  legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
  
  stock=0
  for (i in 1:nrow(data_bp)){
    text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
    stock <- stock+data_bp[i,]
  }
  
  
  #Boxplots for fold test
  
  if(nb_class >= 2){
    
    clean_fold <- fold
    for(i in 1:nrow(clean_fold)){
      for(j in 1:ncol(clean_fold)){
        if(is.infinite(clean_fold[i,j])){
          clean_fold[i,j] <- NA
        }
      }
    }
    for (j in 1:ncol(clean_fold)){
      title <- paste(fold.names[j])
      boxplot(clean_fold[j], main=title)
    }
    }
  
  dev.off()

}