Mercurial > repos > melpetera > corr_table
view CorrTable/Corr_Script_samples_row.R @ 0:b22c453e4cf4 draft
Uploaded
author | melpetera |
---|---|
date | Thu, 11 Oct 2018 05:35:55 -0400 |
parents | |
children | 29ec7e3afdd4 |
line wrap: on
line source
################################################################################################# # CORRELATION TABLE # # # # # # Input : 2 tables with common samples # # Output : Correlation table ; Heatmap (pdf) # # # # Dependencies : Libraries "ggplot2" and "reshape2" # # # ################################################################################################# # Parameters (for dev) if(FALSE){ rm(list = ls()) setwd(dir = "Y:/Developpement") tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular" tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt" param1.samples <- "column" param2.samples <- "row" corr.method <- "pearson" test.corr <- "yes" alpha <- 0.05 multi.name <- "none" filter <- "yes" filters.choice <- "filters_0_thr" threshold <- 0.2 reorder.var <- "yes" color.heatmap <- "yes" type.classes <-"irregular" reg.value <- 1/3 irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) output1 <- "Correlation_table.txt" output2 <- "Heatmap.pdf" } correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes, reg.value, irreg.vect, output1, output2){ # This function allows to visualize the correlation between two tables # # Parameters: # - tab1.name: table 1 file's access # - tab2.name: table 2 file's access # - param1.samples ("row" or "column"): where the samples are in tab1 # - param2.samples ("row" or "column"): where the samples are in tab2 # - corr.method ("pearson", "spearman", "kendall"): # - test.corr ("yes" or "no"): test the significance of a correlation coefficient # - alpha (value between 0 and 1): risk for the correlation significance test # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests # - filter ("yes", "no"): use filter.0 or/and filter.threshold # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 # and threshold filter remove variables with all their correlation coefficients in abs < threshold # - threshold (value between 0 and 1): threshold for filter threshold # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes # - reg.value (value between 0 and 1): value for regular classes # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) # - output1: correlation table file's access # - output2: heatmap (colored correlation table) file's access # Input ---------------------------------------------------------------------------------------------- tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) # Transpose tables according to the samples if(param1.samples == "column"){ tab1 <- t(tab1) } if(param2.samples == "column"){ tab2 <- t(tab2) } # Sorting tables in alphabetical order of the samples tab1 <- tab1[order(rownames(tab1)),] tab2 <- tab2[order(rownames(tab2)),] # Check if the 2 datasets match regarding samples identifiers # Adapt from functions "check.err" and "match2", RcheckLibrary.R err.stock <- NULL id1 <- rownames(tab1) id2 <- rownames(tab2) if(sum(id1 != id2) > 0){ err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n") if(length(which(id1%in%id2)) != length(id1)){ identif <- id1[which(!(id1%in%id2))] if (length(identif) < 4){ err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n") } else { err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n") } identif <- identif[1:min(3,length(which(!(id1%in%id2))))] err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") } if(length(which(id2%in%id1)) != length(id2)){ identif <- id2[which(!(id2%in%id1))] if (length(identif) < 4){ err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n") } else{ err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n") } identif <- identif[1:min(3,length(which(!(id2%in%id1))))] err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") } err.stock <- c(err.stock,"\nPlease check your data.\n") } if(length(err.stock)!=0){ stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") } # Check qualitative variables in each input tables err.msg <- NULL var1.quali <- vector() var2.quali <- vector() for (i in 1:dim(tab1)[2]){ if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){ var1.quali <- c(var1.quali,i) } } for (j in 1:dim(tab2)[2]){ if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){ var2.quali <- c(var2.quali, j) } } if (length(var1.quali) != 0 | length(var2.quali) != 0){ err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n") if(length(var1.quali) != 0 && length(var1.quali) < 4){ err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n", " ",paste(colnames(tab1)[var1.quali],collapse="\n "),"\n") } else if(length(var1.quali) != 0 && length(var1.quali) > 3){ err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n", " ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n "),"\n") } if(length(var2.quali) != 0 && length(var2.quali) < 4){ err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", " ",paste(colnames(tab2)[var2.quali],collapse="\n "),"\n") } else if(length(var2.quali) != 0 && length(var2.quali) > 3){ err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") } } if(length(var1.quali) != 0){ tab1 <- tab1[,-var1.quali] } if(length(var2.quali) != 0){ tab2 <- tab2[,-var2.quali] } if(length(err.msg) != 0){ cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") } # Correlation table --------------------------------------------------------------------------------- tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) for (i in 1:dim(tab2)[2]){ for (j in 1:dim(tab1)[2]){ tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") } } colnames(tab.corr) <- colnames(tab1) rownames(tab.corr) <- colnames(tab2) # Significance of correlation test ------------------------------------------------------------------ if (test.corr == "yes"){ pvalue <- vector() for (i in 1:dim(tab.corr)[1]){ for (j in 1:dim(tab.corr)[2]){ suppressWarnings(corrtest <- cor.test(tab2[,i], tab1[,j], method = corr.method)) pvalue <- c(pvalue, corrtest$p.value) if (multi.name == "none"){ if (corrtest$p.value > alpha){ tab.corr[i,j] <- 0 } } } } if(multi.name != "none"){ adjust <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2], byrow = T) tab.corr[adjust > alpha] <- 0 } } # Filter settings ------------------------------------------------------------------------------------ if (filter == "yes"){ # Remove variables with all their correlation coefficients = 0 : if (filters.choice == "filter_0"){ threshold <- 0 } var2.thres <- vector() for (i in 1:dim(tab.corr)[1]){ if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){ var2.thres <- c(var2.thres, i) } } if (length(var2.thres) != 0){ tab.corr <- tab.corr[-var2.thres,] tab2 <- tab2[, -var2.thres] } var1.thres <- vector() for (i in 1:dim(tab.corr)[2]){ if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ var1.thres <- c(var1.thres, i) } } if (length(var1.thres) != 0){ tab.corr <- tab.corr[,-var1.thres] tab1 <- tab1[,-var1.thres] } } # Reorder variables in the correlation table (with the HCA) ------------------------------------------ if (reorder.var == "yes"){ cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") dist.tab2 <- as.dist(1 - cormat.tab2) hc.tab2 <- hclust(dist.tab2, method = "ward.D2") tab.corr <- tab.corr[hc.tab2$order,] cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") dist.tab1 <- as.dist(1 - cormat.tab1) hc.tab1 <- hclust(dist.tab1, method = "ward.D2") tab.corr <- tab.corr[,hc.tab1$order] } # Output 1 : Correlation table ----------------------------------------------------------------------- # Export correlation table write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) # Create the heatmap --------------------------------------------------------------------------------- # A message if no variable kept if(length(tab.corr)==0){ pdf(output2) plot.new() legend("center","Filtering leads to no remaining correlation coefficient.") dev.off() } else { library(ggplot2) library(reshape2) # Melt the correlation table : melted.tab.corr <- melt(tab.corr) if (color.heatmap == "yes") { # Add a column for the classes of each correlation coefficient classe <- rep(0, dim(melted.tab.corr)[1]) melted <- cbind(melted.tab.corr, classe) if (type.classes == "regular"){ vect <- vector() if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){ vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], rev(seq(1,0,-reg.value))) } else { vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value))) } } else if (type.classes == "irregular") { irreg.vect <- c(-1, irreg.vect, 1) vect <- irreg.vect } # Color palette : myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline") # Create vector intervals cl <- vector() cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "") for (x in 2:(length(vect)-1)) { if (vect[x+1] == 0) { cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = "")) } else { cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "]", sep = "")) } } # Assign an interval to each correlation coefficient for (i in 1:dim(melted.tab.corr)[1]){ for (j in 1:(length(cl))){ if (vect[j] == -1){ melted$classe[i][melted$value[i] >= vect[j] && melted$value[i] <= vect[j+1]] <- cl[j] } else { melted$classe[i][melted$value[i] > vect[j] && melted$value[i] <= vect[j+1]] <- cl[j] } } } # Find the 0 and assign it the white as name if (length(which(vect == 0)) == 1) { melted$classe[melted$value == 0] <- "0" indic <- which(vect == 0) cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)]) names(cl)[indic] <- "#FFFFFF" } else if (length(which(vect == 0)) == 0) { indic <- 0 for (x in 1:(length(vect)-1)) { if (0 > vect[x] && 0 <= vect[x+1]) { names(cl)[x] <- "#FFFFFF" indic <- x } } } indic <- length(cl) - indic + 1 cl <- rev(cl) # Assign the colors of each intervals as their name names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1] names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))] melted$classe <- factor(melted$classe) melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)]) # Heatmap if color.heatmap = yes : ggplot(melted, aes(Var2, Var1, fill = classe)) + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + geom_tile(color ="ghostwhite") + scale_fill_manual( breaks = levels(melted$classe), values = names(cl)[cl%in%levels(melted$classe)], name = paste(corr.method, "correlation", sep = "\n")) + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) } else { # Heatmap if color.heatmap = no : ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) + ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + geom_tile(color ="ghostwhite") + scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1), name = paste(corr.method, "correlation", sep = "\n")) + theme_classic() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5), plot.title = element_text(hjust = 0.5)) } ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) } # End if(length(tab.corr)==0)else } # End of correlation.tab # Function call # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, # filters.choice, threshold, reorder.var, color.heatmap, type.classes, # reg.value, irreg.vect, output1, output2)