Mercurial > repos > melpetera > corr_table
diff CorrTable/Corr_Script_samples_row.R @ 0:b22c453e4cf4 draft
Uploaded
author | melpetera |
---|---|
date | Thu, 11 Oct 2018 05:35:55 -0400 |
parents | |
children | 29ec7e3afdd4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CorrTable/Corr_Script_samples_row.R Thu Oct 11 05:35:55 2018 -0400 @@ -0,0 +1,410 @@ + ################################################################################################# + # CORRELATION TABLE # + # # + # # + # Input : 2 tables with common samples # + # Output : Correlation table ; Heatmap (pdf) # + # # + # Dependencies : Libraries "ggplot2" and "reshape2" # + # # + ################################################################################################# + + + # Parameters (for dev) + if(FALSE){ + + rm(list = ls()) + setwd(dir = "Y:/Developpement") + + tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular" + tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt" + param1.samples <- "column" + param2.samples <- "row" + corr.method <- "pearson" + test.corr <- "yes" + alpha <- 0.05 + multi.name <- "none" + filter <- "yes" + filters.choice <- "filters_0_thr" + threshold <- 0.2 + reorder.var <- "yes" + color.heatmap <- "yes" + type.classes <-"irregular" + reg.value <- 1/3 + irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) + output1 <- "Correlation_table.txt" + output2 <- "Heatmap.pdf" + + } + + + + correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, + multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes, + reg.value, irreg.vect, output1, output2){ + + # This function allows to visualize the correlation between two tables + # + # Parameters: + # - tab1.name: table 1 file's access + # - tab2.name: table 2 file's access + # - param1.samples ("row" or "column"): where the samples are in tab1 + # - param2.samples ("row" or "column"): where the samples are in tab2 + # - corr.method ("pearson", "spearman", "kendall"): + # - test.corr ("yes" or "no"): test the significance of a correlation coefficient + # - alpha (value between 0 and 1): risk for the correlation significance test + # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests + # - filter ("yes", "no"): use filter.0 or/and filter.threshold + # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 + # and threshold filter remove variables with all their correlation coefficients in abs < threshold + # - threshold (value between 0 and 1): threshold for filter threshold + # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA + # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user + # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes + # - reg.value (value between 0 and 1): value for regular classes + # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) + # - output1: correlation table file's access + # - output2: heatmap (colored correlation table) file's access + + + # Input ---------------------------------------------------------------------------------------------- + + tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) + tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) + + # Transpose tables according to the samples + if(param1.samples == "column"){ + tab1 <- t(tab1) + } + + if(param2.samples == "column"){ + tab2 <- t(tab2) + } + + # Sorting tables in alphabetical order of the samples + tab1 <- tab1[order(rownames(tab1)),] + tab2 <- tab2[order(rownames(tab2)),] + + + # Check if the 2 datasets match regarding samples identifiers + # Adapt from functions "check.err" and "match2", RcheckLibrary.R + + err.stock <- NULL + + id1 <- rownames(tab1) + id2 <- rownames(tab2) + + if(sum(id1 != id2) > 0){ + err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n") + + if(length(which(id1%in%id2)) != length(id1)){ + identif <- id1[which(!(id1%in%id2))] + if (length(identif) < 4){ + err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n") + } + else { + err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n") + } + identif <- identif[1:min(3,length(which(!(id1%in%id2))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + + if(length(which(id2%in%id1)) != length(id2)){ + identif <- id2[which(!(id2%in%id1))] + if (length(identif) < 4){ + err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n") + } + else{ + err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n") + } + identif <- identif[1:min(3,length(which(!(id2%in%id1))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + err.stock <- c(err.stock,"\nPlease check your data.\n") + } + + if(length(err.stock)!=0){ + stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") + } + + + # Check qualitative variables in each input tables + err.msg <- NULL + + var1.quali <- vector() + var2.quali <- vector() + + for (i in 1:dim(tab1)[2]){ + if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){ + var1.quali <- c(var1.quali,i) + } + } + + for (j in 1:dim(tab2)[2]){ + if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){ + var2.quali <- c(var2.quali, j) + } + } + + if (length(var1.quali) != 0 | length(var2.quali) != 0){ + err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n") + + if(length(var1.quali) != 0 && length(var1.quali) < 4){ + err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n", + " ",paste(colnames(tab1)[var1.quali],collapse="\n "),"\n") + } else if(length(var1.quali) != 0 && length(var1.quali) > 3){ + err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n", + " ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n "),"\n") + } + + if(length(var2.quali) != 0 && length(var2.quali) < 4){ + err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", + " ",paste(colnames(tab2)[var2.quali],collapse="\n "),"\n") + } else if(length(var2.quali) != 0 && length(var2.quali) > 3){ + err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", + " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") + } + } + + if(length(var1.quali) != 0){ + tab1 <- tab1[,-var1.quali] + } + if(length(var2.quali) != 0){ + tab2 <- tab2[,-var2.quali] + } + + if(length(err.msg) != 0){ + cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") + } + + # Correlation table --------------------------------------------------------------------------------- + + tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) + for (i in 1:dim(tab2)[2]){ + for (j in 1:dim(tab1)[2]){ + tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") + } + } + + colnames(tab.corr) <- colnames(tab1) + rownames(tab.corr) <- colnames(tab2) + + + + # Significance of correlation test ------------------------------------------------------------------ + + if (test.corr == "yes"){ + + pvalue <- vector() + for (i in 1:dim(tab.corr)[1]){ + for (j in 1:dim(tab.corr)[2]){ + suppressWarnings(corrtest <- cor.test(tab2[,i], tab1[,j], method = corr.method)) + pvalue <- c(pvalue, corrtest$p.value) + if (multi.name == "none"){ + if (corrtest$p.value > alpha){ + tab.corr[i,j] <- 0 + } + } + } + } + + if(multi.name != "none"){ + adjust <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2], byrow = T) + tab.corr[adjust > alpha] <- 0 + } + } + + + # Filter settings ------------------------------------------------------------------------------------ + + if (filter == "yes"){ + + # Remove variables with all their correlation coefficients = 0 : + if (filters.choice == "filter_0"){ + threshold <- 0 + } + + var2.thres <- vector() + for (i in 1:dim(tab.corr)[1]){ + if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){ + var2.thres <- c(var2.thres, i) + } + } + + if (length(var2.thres) != 0){ + tab.corr <- tab.corr[-var2.thres,] + tab2 <- tab2[, -var2.thres] + } + + var1.thres <- vector() + for (i in 1:dim(tab.corr)[2]){ + if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ + var1.thres <- c(var1.thres, i) + } + } + + if (length(var1.thres) != 0){ + tab.corr <- tab.corr[,-var1.thres] + tab1 <- tab1[,-var1.thres] + } + + } + + + # Reorder variables in the correlation table (with the HCA) ------------------------------------------ + if (reorder.var == "yes"){ + + cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") + dist.tab2 <- as.dist(1 - cormat.tab2) + hc.tab2 <- hclust(dist.tab2, method = "ward.D2") + tab.corr <- tab.corr[hc.tab2$order,] + + cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") + dist.tab1 <- as.dist(1 - cormat.tab1) + hc.tab1 <- hclust(dist.tab1, method = "ward.D2") + tab.corr <- tab.corr[,hc.tab1$order] + + } + + + + # Output 1 : Correlation table ----------------------------------------------------------------------- + + # Export correlation table + write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) + + + + # Create the heatmap --------------------------------------------------------------------------------- + + # A message if no variable kept + if(length(tab.corr)==0){ + pdf(output2) + plot.new() + legend("center","Filtering leads to no remaining correlation coefficient.") + dev.off() + } else { + + + library(ggplot2) + library(reshape2) + + # Melt the correlation table : + melted.tab.corr <- melt(tab.corr) + + if (color.heatmap == "yes") { + + # Add a column for the classes of each correlation coefficient + classe <- rep(0, dim(melted.tab.corr)[1]) + melted <- cbind(melted.tab.corr, classe) + + if (type.classes == "regular"){ + + vect <- vector() + if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){ + vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], + rev(seq(1,0,-reg.value))) + } else { + vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value))) + } + + } else if (type.classes == "irregular") { + + irreg.vect <- c(-1, irreg.vect, 1) + vect <- irreg.vect + + } + + # Color palette : + myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline") + + # Create vector intervals + cl <- vector() + cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "") + + for (x in 2:(length(vect)-1)) { + if (vect[x+1] == 0) { + cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = "")) + } else { + cl <- c(cl, paste("]", round(vect[x],3), ";", + round(vect[x+1],3), "]", sep = "")) + } + } + + # Assign an interval to each correlation coefficient + for (i in 1:dim(melted.tab.corr)[1]){ + for (j in 1:(length(cl))){ + if (vect[j] == -1){ + melted$classe[i][melted$value[i] >= vect[j] + && melted$value[i] <= vect[j+1]] <- cl[j] + } else { + melted$classe[i][melted$value[i] > vect[j] + && melted$value[i] <= vect[j+1]] <- cl[j] + } + } + } + + # Find the 0 and assign it the white as name + if (length(which(vect == 0)) == 1) { + melted$classe[melted$value == 0] <- "0" + indic <- which(vect == 0) + cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)]) + names(cl)[indic] <- "#FFFFFF" + } else if (length(which(vect == 0)) == 0) { + indic <- 0 + for (x in 1:(length(vect)-1)) { + if (0 > vect[x] && 0 <= vect[x+1]) { + names(cl)[x] <- "#FFFFFF" + indic <- x + } + } + } + + indic <- length(cl) - indic + 1 + cl <- rev(cl) + + # Assign the colors of each intervals as their name + names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1] + names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))] + + + melted$classe <- factor(melted$classe) + melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)]) + + # Heatmap if color.heatmap = yes : + ggplot(melted, aes(Var2, Var1, fill = classe)) + + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + + geom_tile(color ="ghostwhite") + + scale_fill_manual( breaks = levels(melted$classe), + values = names(cl)[cl%in%levels(melted$classe)], + name = paste(corr.method, "correlation", sep = "\n")) + + theme_classic() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), + plot.title = element_text(hjust = 0.5)) + + } else { + + # Heatmap if color.heatmap = no : + ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) + + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + + geom_tile(color ="ghostwhite") + + scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1), + name = paste(corr.method, "correlation", sep = "\n")) + + theme_classic() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), + plot.title = element_text(hjust = 0.5)) + } + + + ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) + + + } # End if(length(tab.corr)==0)else + + } # End of correlation.tab + + + # Function call + # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, + # filters.choice, threshold, reorder.var, color.heatmap, type.classes, + # reg.value, irreg.vect, output1, output2)