view CorrTable/Corr_Script_samples_row.R @ 2:2173ad5e7750 draft default tip

Uploaded
author melpetera
date Wed, 16 Oct 2019 03:12:55 -0400
parents 29ec7e3afdd4
children
line wrap: on
line source

  #################################################################################################
  # CORRELATION TABLE                                                                             #
  #                                                                                               #
  #                                                                                               #
  # Input : 2 tables with shared samples                                                          #
  # Output : Correlation table ; Heatmap (pdf)                                                    #
  #                                                                                               #
  # Dependencies : Libraries "ggplot2" and "reshape2"                                             #
  #                                                                                               #
  #################################################################################################
  
  
  # Parameters (for dev)
  if(FALSE){
    
    tab1.name <- "test/ressources/inputs/CT/CT2_DM.tabular"
    tab2.name <- "test/ressources/inputs/CT/CT2_base_Diapason_14ClinCES_PRIN.txt"
    param1.samples <- "column"
    param2.samples <- "row"
    corr.method <- "pearson"
    test.corr <- "yes"
    alpha <- 0.05
    multi.name <- "none"
    filter <- "yes"
    filters.choice <- "filters_0_thr"
    threshold <- 0.2
    reorder.var <- "yes"
	plot.choice <- "auto"
    color.heatmap <- "yes"
    type.classes <-"irregular"
    reg.value <- 1/3
    irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4)
    output1 <- "Correlation_table.txt"
    output2 <- "Heatmap.pdf"
    
  }
  
  
  
  correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, 
                              multi.name, filter, filters.choice, threshold, reorder.var, plot.choice, color.heatmap, 
                              type.classes, reg.value, irreg.vect, output1, output2){

    # This function allows to visualize the correlation between two tables
    # 
    # Parameters:
    # - tab1.name: table 1 file's access
    # - tab2.name: table 2 file's access
    # - param1.samples ("row" or "column"): where the samples are in tab1
    # - param2.samples ("row" or "column"): where the samples are in tab2
    # - corr.method ("pearson", "spearman", "kendall"): 
    # - test.corr ("yes" or "no"): test the significance of a correlation coefficient
    # - alpha (value between 0 and 1): risk for the correlation significance test
    # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests
    # - filter ("yes", "no"): use filter.0 or/and filter.threshold
    # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0
    #   and threshold filter remove variables with all their correlation coefficients in abs < threshold
    # - threshold (value between 0 and 1): threshold for filter threshold
    # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA
	# - plot.choice ("auto", "forced" or "none"): determine whether a heatmap is plotted
    # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user
    # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes
    # - reg.value (value between 0 and 1): value for regular classes
    # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes)
    # - output1: correlation table file's access
    # - output2: heatmap (colored correlation table) file's access
  
  
  # Input ----------------------------------------------------------------------------------------------
  
  tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) 
  tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1)  
  
  # Transpose tables according to the samples
  if(param1.samples == "column"){
    tab1 <- t(tab1)
  } 
  
  if(param2.samples == "column"){
    tab2 <- t(tab2)
  }
  
  # Sorting tables in alphabetical order of the samples
  tab1 <- tab1[order(rownames(tab1)),]
  tab2 <- tab2[order(rownames(tab2)),]
  

  # Checks ---------------------------------------------------------------------------------------------

  # Check if the 2 datasets match regarding samples identifiers
  # Adapt from functions "check.err" and "match2", RcheckLibrary.R
    
  err.stock <- NULL
    
  id1 <- rownames(tab1)
  id2 <- rownames(tab2)
    
  if(sum(id1 != id2) > 0){
    err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n")
      
    if(length(which(id1%in%id2)) != length(id1)){
      identif <- id1[which(!(id1%in%id2))]
      if (length(identif) < 4){
        err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n")
      }
      else {
        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n")
      }
      identif <- identif[1:min(3,length(which(!(id1%in%id2))))]
      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
    }
      
    if(length(which(id2%in%id1)) != length(id2)){
      identif <- id2[which(!(id2%in%id1))]
      if (length(identif) < 4){
        err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n")
      }
      else{
        err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n")
      }
      identif <- identif[1:min(3,length(which(!(id2%in%id1))))] 
      err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
    }
    err.stock <- c(err.stock,"\nPlease check your data.\n")
  }
  
  if(length(err.stock)!=0){ 
    stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n")
  }
  
    
  # Check whether tab1=tab2
  
  err.msg <- NULL
  
  if((ncol(tab1)==ncol(tab2))&&(sum(tab1!=tab2,na.rm=TRUE)==0)){
    autocor <- TRUE
	err.msg <- c(err.msg, "\nYou chose the same table for the two dataset inputs. \nTo allow filtering options,",
	             "we will turn the diagonal to 0 in the correlation matrix during the filtering process.\n")
  }else{
    autocor <- FALSE
  }
  
  
  # Check qualitative variables in each input tables
  
  var1.quali <- vector()
  var2.quali <- vector()
  
  for (i in 1:dim(tab1)[2]){
    if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){
      var1.quali <- c(var1.quali,i)
    }
  }
  
  for (j in 1:dim(tab2)[2]){
    if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){
      var2.quali <- c(var2.quali, j)
    }
  }
  
  if (length(var1.quali) != 0 | length(var2.quali) != 0){
    err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n")
    
    if(length(var1.quali) != 0 && length(var1.quali) < 4){
      err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.quali],collapse="\n    "),"\n")
    } else if(length(var1.quali) != 0 && length(var1.quali) > 3){
      err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n    "),"\n")
    }
    
    if(length(var2.quali) != 0 && length(var2.quali) < 4){
      err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", 
                   "  ",paste(colnames(tab2)[var2.quali],collapse="\n    "),"\n")
    } else if(length(var2.quali) != 0 && length(var2.quali) > 3){
      err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", 
                   "  ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n    "),"\n")
    }  
  }
  
  if(length(var1.quali) != 0){
    tab1 <- tab1[,-var1.quali,drop=FALSE]
  }
  if(length(var2.quali) != 0){
    tab2 <- tab2[,-var2.quali,drop=FALSE]
  }
  
  
  # Check whether there are constant variables
  
  var1.cons <- vector()
  var2.cons <- vector()
  
  for (i in 1:dim(tab1)[2]){
    if(length(levels(as.factor(tab1[,i])))==1){ var1.cons <- c(var1.cons,i) }
  }
  
  for (j in 1:dim(tab2)[2]){
    if(length(levels(as.factor(tab2[,j])))==1){ var2.cons <- c(var2.cons, j) }
  } 
  
  if (length(var1.cons) != 0 | length(var2.cons) != 0){
    err.msg <- c(err.msg, "\nThere are constant variables in your input tables which have been removed to compute the correlation table.\n\n")
    
    if(length(var1.cons) != 0 && length(var1.cons) < 4){
      err.msg <- c(err.msg, "In table 1, the following constant variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.cons],collapse="\n    "),"\n")
    } else if(length(var1.cons) != 0 && length(var1.cons) > 3){
      err.msg <- c(err.msg, "For example, in table 1, the following constant variables have been removed:\n",
                   "  ",paste(colnames(tab1)[var1.cons[1:3]],collapse="\n    "),"\n")
    }
    
    if(length(var2.cons) != 0 && length(var2.cons) < 4){
      err.msg <- c(err.msg, "In table 2, the following constant variables have been removed:\n", 
                   "  ",paste(colnames(tab2)[var2.cons],collapse="\n    "),"\n")
    } else if(length(var2.cons) != 0 && length(var2.cons) > 3){
      err.msg <- c(err.msg, "For example, in table 2, the following constant variables have been removed:\n", 
                   "  ",paste(colnames(tab2)[var2.cons[1:3]],collapse="\n    "),"\n")
    }  
  }
  
  if(length(var1.cons) != 0){
    tab1 <- tab1[,-var1.cons,drop=FALSE]
  }
  if(length(var2.cons) != 0){
    tab2 <- tab2[,-var2.cons,drop=FALSE]
  }
  
  
  # Print info message
  
  if(length(err.msg) != 0){ 
    cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n")
  }
  
  rm(err.stock,var1.quali,var2.quali,var1.cons,var2.cons,err.msg) 
   
   
  # Correlation table ---------------------------------------------------------------------------------
  
  tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2])
  for (i in 1:dim(tab2)[2]){
    for (j in 1:dim(tab1)[2]){
      tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") 
    }
  }
  
  colnames(tab.corr) <- colnames(tab1)
  rownames(tab.corr) <- colnames(tab2)
  
  
  # Significance of correlation test  ------------------------------------------------------------------
  
  if (test.corr == "yes"){
    
	repcorrtest1 <- function(vari1,vari2,corrmeth){
	  suppressWarnings(corrtest <- cor.test(vari2, vari1, method = corrmeth))
	  return(corrtest$p.value)
	}
	repcorrtest2 <- function(stab,ftab,cormeth){
	  listp <- apply(X=ftab,2,repcorrtest1,vari2=stab,corrmeth=cormeth)
	  return(listp)
	}
	pvalue <- apply(X=tab1,2,repcorrtest2,ftab=tab2,cormeth=corr.method)
	
	
    if(multi.name != "none"){
      pvalue <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2])
    } 
	
	tab.corr[pvalue > alpha] <- 0 
	
  }  
  
  
  # Filter settings ------------------------------------------------------------------------------------
  
  if (filter == "yes"){
  
    # Turn diagonal from 1 to 0 if autocorrelation
	if(autocor){
	  for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 0 }
	}
    
    # Remove variables with all their correlation coefficients = 0 :
    if (filters.choice == "filter_0"){
      threshold <- 0
    } 
      
    var2.thres <- vector()
    for (i in 1:dim(tab.corr)[1]){
      if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){
        var2.thres <- c(var2.thres, i)
      }
    }
    
    if (length(var2.thres) != 0){
      tab.corr <- tab.corr[-var2.thres,,drop=FALSE]
      tab2 <- tab2[, -var2.thres,drop=FALSE]
    }
    
    var1.thres <- vector()
    for (i in 1:dim(tab.corr)[2]){
      if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){
        var1.thres <- c(var1.thres, i)
      }
    }
      
    if (length(var1.thres) != 0){
      tab.corr <- tab.corr[,-var1.thres,drop=FALSE]
      tab1 <- tab1[,-var1.thres,drop=FALSE]
    }
	
	# Turn diagonal from 0 back to 1 if autocorrelation
	if(autocor){
	  for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 1 }
	}
    
  } 
  
  
  # Reorder variables in the correlation table (with the HCA) ------------------------------------------
  if (reorder.var == "yes"){
    
    cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs")
    dist.tab2 <- as.dist(1 - cormat.tab2)
    hc.tab2 <- hclust(dist.tab2, method = "ward.D2")
    tab.corr <- tab.corr[hc.tab2$order,]
	rm(cormat.tab2)
    
    cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs")
    dist.tab1 <- as.dist(1 - cormat.tab1)
    hc.tab1 <- hclust(dist.tab1, method = "ward.D2")
    tab.corr <- tab.corr[,hc.tab1$order]
	rm(cormat.tab1)
    
  }
  
  
  # Output 1 : Correlation table -----------------------------------------------------------------------
  
  
  # Export correlation table
  write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE)
  
  
  
  # Create the heatmap ---------------------------------------------------------------------------------

  if(plot.choice != "none"){
 
  # A message if no variable kept
  if(length(tab.corr)==0){
	pdf(output2)
	plot.new()
	legend("center","Filtering leads to no remaining correlation coefficient.")
	dev.off()
  } else {
  
  # A message if more than 1000 variable in auto mode
  if((plot.choice=="auto")&&(max(dim(tab.corr))>1000)){
	pdf(output2)
	war.msg <- paste0("In 'default' mode, the colored table is not provided when\none of the tables contains more than ",
	                  "a thousand\nvariables after the filter step.\n\nOne of your table still contains ",max(dim(tab.corr))," variables.\n",
					  "Please consider more filtering, or use the 'Always plot a\ncolored table' mode to obtain your colored table.")
	plot.new()
	legend("center",war.msg,adj=c(0.05,0.075))
	dev.off()
  } else {
  
  
  library(ggplot2)
  library(reshape2)
  
  # Melt the correlation table :
  melted.tab.corr <- melt(tab.corr)
  
  if (color.heatmap == "yes") {
     
    # Add a column for the classes of each correlation coefficient
    classe <- rep(0, dim(melted.tab.corr)[1])
    melted <- cbind(melted.tab.corr, classe)
    
    if (type.classes == "regular"){
      
      vect <- vector()
      if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){
        vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], 
                  rev(seq(1,0,-reg.value)))
      } else {
        vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value)))
      }
      
    } else if (type.classes == "irregular") {
      
      irreg.vect <- c(-1, irreg.vect, 1) 
      vect <- irreg.vect
      
    }
    
    # Color palette :  
    myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline")
    
    # Create vector intervals
    cl <- vector()
    cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "")
    
    for (x in 2:(length(vect)-1)) {
      if (vect[x+1] == 0) {
        cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = ""))
      } else {
        cl <- c(cl, paste("]", round(vect[x],3), ";", 
                          round(vect[x+1],3), "]", sep = ""))
      }
    }
    
    # Assign an interval to each correlation coefficient
      for (j in 1:(length(cl))){
        if (vect[j] == -1){
          melted$classe[melted$value >= vect[j] & melted$value <= vect[j+1]] <- cl[j]
        } else {
          melted$classe[melted$value > vect[j] & melted$value <= vect[j+1]] <- cl[j]
        }
      }
	
    
    # Find the 0 and assign it the white as name
    if (length(which(vect == 0)) == 1) {
      melted$classe[melted$value == 0] <- "0"
      indic <- which(vect == 0)
      cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)])
      names(cl)[indic] <- "#FFFFFF"
    } else if (length(which(vect == 0)) == 0) {
      indic <- 0
      for (x in 1:(length(vect)-1)) {
        if (0 > vect[x] && 0 <= vect[x+1]) {
          names(cl)[x] <- "#FFFFFF"
          indic <- x
        }
      }
    }
    
    indic <- length(cl) - indic + 1
    cl <- rev(cl)
    
    # Assign the colors of each intervals as their name
    names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1]
    names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))]
    
    
    melted$classe <- factor(melted$classe)
    melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)])
    
    # Heatmap if color.heatmap = yes :
    ggplot(melted, aes(Var2, Var1, fill = classe)) +
      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
      geom_tile(color ="ghostwhite") +
      scale_fill_manual( breaks = levels(melted$classe),
                         values = names(cl)[cl%in%levels(melted$classe)],
                         name = paste(corr.method, "correlation", sep = "\n")) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5), 
            plot.title = element_text(hjust = 0.5)) 
    
  } else {
    
    # Heatmap if color.heatmap = no :
    ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) +
      ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") +
      geom_tile(color ="ghostwhite") +
      scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1),
                           name = paste(corr.method, "correlation", sep = "\n")) +
      theme_classic() +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5), 
            plot.title = element_text(hjust = 0.5))
  }
  
  
  ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE)
  
  
  } # End of if((plot.choice=="auto")&&(max(dim(tab.corr))>1000))else
  } # End of if(length(tab.corr)==0)else
  } # End of if(plot.choice != "auto")
  
  
  } # End of correlation.tab
  
  
  # Function call
  # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter,
  #                             filters.choice, threshold, reorder.var, plot.choice, color.heatmap, type.classes,
  #                             reg.value, irreg.vect, output1, output2)