Mercurial > repos > melpetera > corr_table
view CorrTable/Corr_Script_samples_row.R @ 2:2173ad5e7750 draft default tip
Uploaded
author | melpetera |
---|---|
date | Wed, 16 Oct 2019 03:12:55 -0400 |
parents | 29ec7e3afdd4 |
children |
line wrap: on
line source
################################################################################################# # CORRELATION TABLE # # # # # # Input : 2 tables with shared samples # # Output : Correlation table ; Heatmap (pdf) # # # # Dependencies : Libraries "ggplot2" and "reshape2" # # # ################################################################################################# # Parameters (for dev) if(FALSE){ tab1.name <- "test/ressources/inputs/CT/CT2_DM.tabular" tab2.name <- "test/ressources/inputs/CT/CT2_base_Diapason_14ClinCES_PRIN.txt" param1.samples <- "column" param2.samples <- "row" corr.method <- "pearson" test.corr <- "yes" alpha <- 0.05 multi.name <- "none" filter <- "yes" filters.choice <- "filters_0_thr" threshold <- 0.2 reorder.var <- "yes" plot.choice <- "auto" color.heatmap <- "yes" type.classes <-"irregular" reg.value <- 1/3 irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) output1 <- "Correlation_table.txt" output2 <- "Heatmap.pdf" } correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, filters.choice, threshold, reorder.var, plot.choice, color.heatmap, type.classes, reg.value, irreg.vect, output1, output2){ # This function allows to visualize the correlation between two tables # # Parameters: # - tab1.name: table 1 file's access # - tab2.name: table 2 file's access # - param1.samples ("row" or "column"): where the samples are in tab1 # - param2.samples ("row" or "column"): where the samples are in tab2 # - corr.method ("pearson", "spearman", "kendall"): # - test.corr ("yes" or "no"): test the significance of a correlation coefficient # - alpha (value between 0 and 1): risk for the correlation significance test # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests # - filter ("yes", "no"): use filter.0 or/and filter.threshold # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 # and threshold filter remove variables with all their correlation coefficients in abs < threshold # - threshold (value between 0 and 1): threshold for filter threshold # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA # - plot.choice ("auto", "forced" or "none"): determine whether a heatmap is plotted # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes # - reg.value (value between 0 and 1): value for regular classes # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) # - output1: correlation table file's access # - output2: heatmap (colored correlation table) file's access # Input ---------------------------------------------------------------------------------------------- tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) # Transpose tables according to the samples if(param1.samples == "column"){ tab1 <- t(tab1) } if(param2.samples == "column"){ tab2 <- t(tab2) } # Sorting tables in alphabetical order of the samples tab1 <- tab1[order(rownames(tab1)),] tab2 <- tab2[order(rownames(tab2)),] # Checks --------------------------------------------------------------------------------------------- # Check if the 2 datasets match regarding samples identifiers # Adapt from functions "check.err" and "match2", RcheckLibrary.R err.stock <- NULL id1 <- rownames(tab1) id2 <- rownames(tab2) if(sum(id1 != id2) > 0){ err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n") if(length(which(id1%in%id2)) != length(id1)){ identif <- id1[which(!(id1%in%id2))] if (length(identif) < 4){ err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n") } else { err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n") } identif <- identif[1:min(3,length(which(!(id1%in%id2))))] err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") } if(length(which(id2%in%id1)) != length(id2)){ identif <- id2[which(!(id2%in%id1))] if (length(identif) < 4){ err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n") } else{ err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n") } identif <- identif[1:min(3,length(which(!(id2%in%id1))))] err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") } err.stock <- c(err.stock,"\nPlease check your data.\n") } if(length(err.stock)!=0){ stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") } # Check whether tab1=tab2 err.msg <- NULL if((ncol(tab1)==ncol(tab2))&&(sum(tab1!=tab2,na.rm=TRUE)==0)){ autocor <- TRUE err.msg <- c(err.msg, "\nYou chose the same table for the two dataset inputs. \nTo allow filtering options,", "we will turn the diagonal to 0 in the correlation matrix during the filtering process.\n") }else{ autocor <- FALSE } # Check qualitative variables in each input tables var1.quali <- vector() var2.quali <- vector() for (i in 1:dim(tab1)[2]){ if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){ var1.quali <- c(var1.quali,i) } } for (j in 1:dim(tab2)[2]){ if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){ var2.quali <- c(var2.quali, j) } } if (length(var1.quali) != 0 | length(var2.quali) != 0){ err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n") if(length(var1.quali) != 0 && length(var1.quali) < 4){ err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n", " ",paste(colnames(tab1)[var1.quali],collapse="\n "),"\n") } else if(length(var1.quali) != 0 && length(var1.quali) > 3){ err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n", " ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n "),"\n") } if(length(var2.quali) != 0 && length(var2.quali) < 4){ err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", " ",paste(colnames(tab2)[var2.quali],collapse="\n "),"\n") } else if(length(var2.quali) != 0 && length(var2.quali) > 3){ err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") } } if(length(var1.quali) != 0){ tab1 <- tab1[,-var1.quali,drop=FALSE] } if(length(var2.quali) != 0){ tab2 <- tab2[,-var2.quali,drop=FALSE] } # Check whether there are constant variables var1.cons <- vector() var2.cons <- vector() for (i in 1:dim(tab1)[2]){ if(length(levels(as.factor(tab1[,i])))==1){ var1.cons <- c(var1.cons,i) } } for (j in 1:dim(tab2)[2]){ if(length(levels(as.factor(tab2[,j])))==1){ var2.cons <- c(var2.cons, j) } } if (length(var1.cons) != 0 | length(var2.cons) != 0){ err.msg <- c(err.msg, "\nThere are constant variables in your input tables which have been removed to compute the correlation table.\n\n") if(length(var1.cons) != 0 && length(var1.cons) < 4){ err.msg <- c(err.msg, "In table 1, the following constant variables have been removed:\n", " ",paste(colnames(tab1)[var1.cons],collapse="\n "),"\n") } else if(length(var1.cons) != 0 && length(var1.cons) > 3){ err.msg <- c(err.msg, "For example, in table 1, the following constant variables have been removed:\n", " ",paste(colnames(tab1)[var1.cons[1:3]],collapse="\n "),"\n") } if(length(var2.cons) != 0 && length(var2.cons) < 4){ err.msg <- c(err.msg, "In table 2, the following constant variables have been removed:\n", " ",paste(colnames(tab2)[var2.cons],collapse="\n "),"\n") } else if(length(var2.cons) != 0 && length(var2.cons) > 3){ err.msg <- c(err.msg, "For example, in table 2, the following constant variables have been removed:\n", " ",paste(colnames(tab2)[var2.cons[1:3]],collapse="\n "),"\n") } } if(length(var1.cons) != 0){ tab1 <- tab1[,-var1.cons,drop=FALSE] } if(length(var2.cons) != 0){ tab2 <- tab2[,-var2.cons,drop=FALSE] } # Print info message if(length(err.msg) != 0){ cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") } rm(err.stock,var1.quali,var2.quali,var1.cons,var2.cons,err.msg) # Correlation table --------------------------------------------------------------------------------- tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) for (i in 1:dim(tab2)[2]){ for (j in 1:dim(tab1)[2]){ tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") } } colnames(tab.corr) <- colnames(tab1) rownames(tab.corr) <- colnames(tab2) # Significance of correlation test ------------------------------------------------------------------ if (test.corr == "yes"){ repcorrtest1 <- function(vari1,vari2,corrmeth){ suppressWarnings(corrtest <- cor.test(vari2, vari1, method = corrmeth)) return(corrtest$p.value) } repcorrtest2 <- function(stab,ftab,cormeth){ listp <- apply(X=ftab,2,repcorrtest1,vari2=stab,corrmeth=cormeth) return(listp) } pvalue <- apply(X=tab1,2,repcorrtest2,ftab=tab2,cormeth=corr.method) if(multi.name != "none"){ pvalue <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2]) } tab.corr[pvalue > alpha] <- 0 } # Filter settings ------------------------------------------------------------------------------------ if (filter == "yes"){ # Turn diagonal from 1 to 0 if autocorrelation if(autocor){ for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 0 } } # Remove variables with all their correlation coefficients = 0 : if (filters.choice == "filter_0"){ threshold <- 0 } var2.thres <- vector() for (i in 1:dim(tab.corr)[1]){ if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){ var2.thres <- c(var2.thres, i) } } if (length(var2.thres) != 0){ tab.corr <- tab.corr[-var2.thres,,drop=FALSE] tab2 <- tab2[, -var2.thres,drop=FALSE] } var1.thres <- vector() for (i in 1:dim(tab.corr)[2]){ if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ var1.thres <- c(var1.thres, i) } } if (length(var1.thres) != 0){ tab.corr <- tab.corr[,-var1.thres,drop=FALSE] tab1 <- tab1[,-var1.thres,drop=FALSE] } # Turn diagonal from 0 back to 1 if autocorrelation if(autocor){ for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 1 } } } # Reorder variables in the correlation table (with the HCA) ------------------------------------------ if (reorder.var == "yes"){ cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") dist.tab2 <- as.dist(1 - cormat.tab2) hc.tab2 <- hclust(dist.tab2, method = "ward.D2") tab.corr <- tab.corr[hc.tab2$order,] rm(cormat.tab2) cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") dist.tab1 <- as.dist(1 - cormat.tab1) hc.tab1 <- hclust(dist.tab1, method = "ward.D2") tab.corr <- tab.corr[,hc.tab1$order] rm(cormat.tab1) } # Output 1 : Correlation table ----------------------------------------------------------------------- # Export correlation table write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) # Create the heatmap --------------------------------------------------------------------------------- if(plot.choice != "none"){ # A message if no variable kept if(length(tab.corr)==0){ pdf(output2) plot.new() legend("center","Filtering leads to no remaining correlation coefficient.") dev.off() } else { # A message if more than 1000 variable in auto mode if((plot.choice=="auto")&&(max(dim(tab.corr))>1000)){ pdf(output2) war.msg <- paste0("In 'default' mode, the colored table is not provided when\none of the tables contains more than ", "a thousand\nvariables after the filter step.\n\nOne of your table still contains ",max(dim(tab.corr))," variables.\n", "Please consider more filtering, or use the 'Always plot a\ncolored table' mode to obtain your colored table.") plot.new() legend("center",war.msg,adj=c(0.05,0.075)) dev.off() } else { library(ggplot2) library(reshape2) # Melt the correlation table : melted.tab.corr <- melt(tab.corr) if (color.heatmap == "yes") { # Add a column for the classes of each correlation coefficient classe <- rep(0, dim(melted.tab.corr)[1]) melted <- cbind(melted.tab.corr, classe) if (type.classes == "regular"){ vect <- vector() if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){ vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], rev(seq(1,0,-reg.value))) } else { vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value))) } } else if (type.classes == "irregular") { irreg.vect <- c(-1, irreg.vect, 1) vect <- irreg.vect } # Color palette : myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline") # Create vector intervals cl <- vector() cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "") for (x in 2:(length(vect)-1)) { if (vect[x+1] == 0) { cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = "")) } else { cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "]", sep = "")) } } # Assign an interval to each correlation coefficient for (j in 1:(length(cl))){ if (vect[j] == -1){ melted$classe[melted$value >= vect[j] & melted$value <= vect[j+1]] <- cl[j] } else { melted$classe[melted$value > vect[j] & melted$value <= vect[j+1]] <- cl[j] } } # Find the 0 and assign it the white as name if (length(which(vect == 0)) == 1) { melted$classe[melted$value == 0] <- "0" indic <- which(vect == 0) cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)]) names(cl)[indic] <- "#FFFFFF" } else if (length(which(vect == 0)) == 0) { indic <- 0 for (x in 1:(length(vect)-1)) { if (0 > vect[x] && 0 <= vect[x+1]) { names(cl)[x] <- "#FFFFFF" indic <- x } } } indic <- length(cl) - indic + 1 cl <- rev(cl) # Assign the colors of each intervals as their name names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1] names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))] melted$classe <- factor(melted$classe) melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)]) # Heatmap if color.heatmap = yes : ggplot(melted, aes(Var2, Var1, fill = classe)) + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + geom_tile(color ="ghostwhite") + scale_fill_manual( breaks = levels(melted$classe), values = names(cl)[cl%in%levels(melted$classe)], name = paste(corr.method, "correlation", sep = "\n")) + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) } else { # Heatmap if color.heatmap = no : ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + geom_tile(color ="ghostwhite") + scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1), name = paste(corr.method, "correlation", sep = "\n")) + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) } ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) } # End of if((plot.choice=="auto")&&(max(dim(tab.corr))>1000))else } # End of if(length(tab.corr)==0)else } # End of if(plot.choice != "auto") } # End of correlation.tab # Function call # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, # filters.choice, threshold, reorder.var, plot.choice, color.heatmap, type.classes, # reg.value, irreg.vect, output1, output2)