Mercurial > repos > artbio > gsc_high_dimensions_visualisation
changeset 7:18a1dc4aec4a draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit ef87a68f9a33f8418699d97627eb5f49a5e2c4a6
author | artbio |
---|---|
date | Sun, 21 May 2023 16:26:23 +0000 |
parents | 19bef589f876 |
children | fe6f76030168 |
files | high_dim_visu.R high_dim_visu.xml test-data/pca.nolabels.factors.pdf |
diffstat | 3 files changed, 170 insertions(+), 179 deletions(-) [+] |
line wrap: on
line diff
--- a/high_dim_visu.R Wed Jun 24 06:22:53 2020 -0400 +++ b/high_dim_visu.R Sun May 21 16:26:23 2023 +0000 @@ -1,6 +1,9 @@ -# load packages that are provided in the conda env -options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +options(show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } +) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) @@ -12,211 +15,212 @@ library(RColorBrewer) library(ClusterR) library(data.table) +library(Polychrome) # Arguments -option_list = list( +option_list <- list( make_option( "--data", default = NA, - type = 'character', + type = "character", help = "Input file that contains expression value to visualise" ), make_option( "--sep", - default = '\t', - type = 'character', + default = "\t", + type = "character", help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, - type = 'logical', + type = "logical", help = "Consider first line as header ? [default : '%default' ]" ), make_option( "--out", default = "res.tab", - type = 'character', + type = "character", help = "Output name [default : '%default' ]" ), make_option( "--labels", default = FALSE, - type = 'logical', + type = "logical", help = "add labels in scatter plots [default : '%default' ]" ), make_option( "--factor", - default = '', - type = 'character', + default = "", + type = "character", help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]" ), make_option( "--visu_choice", - default = 'PCA', - type = 'character', + default = "PCA", + type = "character", help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" ), make_option( "--table_coordinates", - default = '', - type = 'character', + default = "", + type = "character", help = "Table with plot coordinates [default : '%default' ]" ), make_option( "--Rtsne_seed", default = 42, - type = 'integer', + type = "integer", help = "Seed value for reproducibility [default : '%default' ]" ), make_option( "--Rtsne_dims", default = 2, - type = 'integer', + type = "integer", help = "Output dimensionality [default : '%default' ]" ), make_option( "--Rtsne_initial_dims", default = 50, - type = 'integer', + type = "integer", help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]" ), make_option( "--Rtsne_perplexity", default = 5.0, - type = 'numeric', + type = "numeric", help = "perplexity [default : '%default' ]" ), make_option( "--Rtsne_theta", default = 1.0, - type = 'numeric', + type = "numeric", help = "theta [default : '%default' ]" ), make_option( "--Rtsne_max_iter", default = 1000, - type = 'integer', + type = "integer", help = "max_iter [default : '%default' ]" ), make_option( "--Rtsne_pca", default = TRUE, - type = 'logical', + type = "logical", help = "Whether an initial PCA step should be performed [default : '%default' ]" ), make_option( "--Rtsne_pca_center", default = TRUE, - type = 'logical', + type = "logical", help = "Should data be centered before pca is applied? [default : '%default' ]" ), make_option( "--Rtsne_pca_scale", default = FALSE, - type = 'logical', + type = "logical", help = "Should data be scaled before pca is applied? [default : '%default' ]" ), make_option( "--Rtsne_normalize", default = TRUE, - type = 'logical', + type = "logical", help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]" ), make_option( "--Rtsne_exaggeration_factor", default = 12.0, - type = 'numeric', + type = "numeric", help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" ), make_option( "--PCA_npc", default = 5, - type = 'integer', + type = "integer", help = "number of dimensions kept in the results [default : '%default' ]" ), make_option( "--PCA_x_axis", default = 1, - type = 'integer', + type = "integer", help = "PC to plot in the x axis [default : '%default' ]" ), make_option( "--PCA_y_axis", default = 2, - type = 'integer', + type = "integer", help = "PC to plot in the y axis [default : '%default' ]" ), make_option( "--HCPC_ncluster", default = -1, - type = 'numeric', + type = "numeric", help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" ), make_option( "--HCPC_npc", default = 5, - type = 'integer', + type = "integer", help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]" ), make_option( "--HCPC_metric", - default = 'euclidean', - type = 'character', + default = "euclidean", + type = "character", help = "Metric to be used for calculating dissimilarities between observations, available 'euclidean' or 'manhattan' [default : '%default' ]" ), make_option( "--HCPC_method", - default = 'ward', - type = 'character', + default = "ward", + type = "character", help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']" ), make_option( "--pdf_out", default = "out.pdf", - type = 'character', + type = "character", help = "pdf of plots [default : '%default' ]" ), make_option( "--HCPC_consol", - default = 'TRUE', - type = 'logical', + default = "TRUE", + type = "logical", help = "If TRUE, a k-means consolidation is performed [default :'%default']" ), make_option( "--HCPC_itermax", - default = '10', - type = 'integer', + default = "10", + type = "integer", help = "The maximum number of iterations for the consolidation [default :'%default']" ), make_option( "--HCPC_min", - default = '3', - type = 'integer', + default = "3", + type = "integer", help = "The least possible number of clusters suggested [default :'%default']" ), make_option( "--HCPC_max", default = -1, - type = 'integer', + type = "integer", help = "The higher possible number of clusters suggested [default :'%default']" ), make_option( "--HCPC_clusterCA", - default = 'rows', - type = 'character', + default = "rows", + type = "character", help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']" ), make_option( "--HCPC_kk", default = Inf, - type = 'numeric', + type = "numeric", help = "The maximum number of iterations for the consolidation [default :'%default']" ), make_option( "--HCPC_clust", default = "", - type = 'character', + type = "character", help = "Output result of HCPC clustering : two column table (cell identifiers and clusters) [default :'%default']" ), make_option( @@ -233,16 +237,24 @@ ) ) -opt = parse_args(OptionParser(option_list = option_list), - args = commandArgs(trailingOnly = TRUE)) +opt <- parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) -if (opt$sep == "tab") {opt$sep <- "\t"} -if (opt$sep == "comma") {opt$sep <- ","} -if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL} -if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf} +if (opt$sep == "tab") { + opt$sep <- "\t" +} +if (opt$sep == "comma") { + opt$sep <- "," +} +if (opt$HCPC_max == -1) { + opt$HCPC_max <- NULL +} +if (opt$HCPC_kk == -1) { + opt$HCPC_kk <- Inf +} ##Add legend to plot() -legend.col <- function(col, lev){ +legend_col <- function(col, lev) { opar <- par @@ -250,22 +262,20 @@ bx <- par("usr") -box.cx <- c(bx[2] + (bx[2] - bx[1]) / 1000, +box_cx <- c(bx[2] + (bx[2] - bx[1]) / 1000, bx[2] + (bx[2] - bx[1]) / 1000 + (bx[2] - bx[1]) / 50) -box.cy <- c(bx[3], bx[3]) -box.sy <- (bx[4] - bx[3]) / n +box_cy <- c(bx[3], bx[3]) +box_sy <- (bx[4] - bx[3]) / n -xx <- rep(box.cx, each = 2) +xx <- rep(box_cx, each = 2) par(xpd = TRUE) -for(i in 1:n){ - -yy <- c(box.cy[1] + (box.sy * (i - 1)), -box.cy[1] + (box.sy * (i)), -box.cy[1] + (box.sy * (i)), -box.cy[1] + (box.sy * (i - 1))) -polygon(xx, yy, col = col[i], border = col[i]) - +for (i in 1:n) { + yy <- c(box_cy[1] + (box_sy * (i - 1)), + box_cy[1] + (box_sy * (i)), + box_cy[1] + (box_sy * (i)), + box_cy[1] + (box_sy * (i - 1))) + polygon(xx, yy, col = col[i], border = col[i]) } par(new = TRUE) plot(0, 0, type = "n", @@ -277,8 +287,7 @@ par <- opar } - -data = read.delim( +data <- read.delim( opt$data, check.names = FALSE, header = opt$colnames, @@ -287,125 +296,127 @@ ) # Contrasting factor and its colors -if (opt$factor != '') { +if (opt$factor != "") { contrasting_factor <- read.delim( opt$factor, header = TRUE ) - rownames(contrasting_factor) <- contrasting_factor[,1] - contrasting_factor <- contrasting_factor[colnames(data),] - colnames(contrasting_factor) <- c("id","factor") - if(is.numeric(contrasting_factor$factor)){ + rownames(contrasting_factor) <- contrasting_factor[, 1] + contrasting_factor <- contrasting_factor[colnames(data), ] + colnames(contrasting_factor) <- c("id", "factor") + if (is.numeric(contrasting_factor$factor)) { factor_cols <- rev(brewer.pal(n = 11, name = "RdYlGn"))[contrasting_factor$factor] } else { contrasting_factor$factor <- as.factor(contrasting_factor$factor) - if(nlevels(contrasting_factor$factor) == 2){ + if (nlevels(contrasting_factor$factor) == 2) { colors_labels <- c("#E41A1C", "#377EB8") } else { - colors_labels <- brewer.pal(nlevels(contrasting_factor$factor), name = 'Paired') + set.seed(567629) + colors_labels <- createPalette(nlevels(contrasting_factor$factor), c("#5A5156", "#E4E1E3", "#F6222E")) + names(colors_labels) <- NULL } - factorColors <- + factor_colors <- with(contrasting_factor, data.frame(factor = levels(contrasting_factor$factor), color = I(colors_labels) ) ) - factor_cols <- factorColors$color[match(contrasting_factor$factor, - factorColors$factor)] + factor_cols <- factor_colors$color[match(contrasting_factor$factor, + factor_colors$factor)] } } else { factor_cols <- rep("deepskyblue4", length(rownames(data))) } ################ t-SNE #################### -if (opt$visu_choice == 'tSNE') { +if (opt$visu_choice == "tSNE") { # filter and transpose df for tsne and pca - tdf = t(data) + tdf <- t(data) # make tsne and plot results set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility tsne_out <- Rtsne(tdf, dims = opt$Rtsne_dims, - initial_dims = opt$Rtsne_initial_dims, - perplexity = opt$Rtsne_perplexity , + initial_dims = opt$Rtsne_initial_dims, + perplexity = opt$Rtsne_perplexity, theta = opt$Rtsne_theta, max_iter = opt$Rtsne_max_iter, - pca = opt$Rtsne_pca, + pca = opt$Rtsne_pca, pca_center = opt$Rtsne_pca_center, pca_scale = opt$Rtsne_pca_scale, normalize = opt$Rtsne_normalize, - exaggeration_factor=opt$Rtsne_exaggeration_factor) + exaggeration_factor = opt$Rtsne_exaggeration_factor) - embedding <- as.data.frame(tsne_out$Y[,1:2]) + embedding <- as.data.frame(tsne_out$Y[, 1:2]) embedding$Class <- as.factor(rownames(tdf)) - gg_legend = theme(legend.position="right") - if (opt$factor == '') { - ggplot(embedding, aes(x=V1, y=V2)) + - geom_point(size=1, color='deepskyblue4') + + gg_legend <- theme(legend.position = "right") + if (opt$factor == "") { + ggplot(embedding, aes(x = V1, y = V2)) + + geom_point(size = 1, color = "deepskyblue4") + gg_legend + xlab("t-SNE 1") + ylab("t-SNE 2") + - ggtitle('t-SNE') + + ggtitle("t-SNE") + if (opt$labels) { - geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4') + geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 1.5, color = "deepskyblue4") } } else { - if(is.numeric(contrasting_factor$factor)){ + if (is.numeric(contrasting_factor$factor)) { embedding$factor <- contrasting_factor$factor } else { embedding$factor <- as.factor(contrasting_factor$factor) } - ggplot(embedding, aes(x=V1, y=V2, color=factor)) + - geom_point(size=1) + #, color=factor_cols + ggplot(embedding, aes(x = V1, y = V2, color = factor)) + + geom_point(size = 1) + gg_legend + xlab("t-SNE 1") + ylab("t-SNE 2") + - ggtitle('t-SNE') + + ggtitle("t-SNE") + if (opt$labels) { - geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5) + geom_text(aes(label = Class, colour = factor), hjust = -0.2, vjust = -0.5, size = 1.5) } - } - ggsave(file=opt$pdf_out, device="pdf") - + } + ggsave(file = opt$pdf_out, device = "pdf") + #save coordinates table - if(opt$table_coordinates != ''){ + if (opt$table_coordinates != "") { coord_table <- cbind(rownames(tdf), round(as.data.frame(tsne_out$Y), 6)) - colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims))) + colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$Rtsne_dims))) } } ######### make PCA with FactoMineR ################# -if (opt$visu_choice == 'PCA') { - pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE) +if (opt$visu_choice == "PCA") { + pca <- PCA(t(data), ncp = opt$PCA_npc, graph = FALSE) pdf(opt$pdf_out) if (opt$labels == FALSE) { - plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), label="none" , col.ind = factor_cols) + plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), label = "none", col.ind = factor_cols) } else { - plot(pca, axes = c(opt$PCA_x_axis,opt$PCA_y_axis), cex=0.2 , col.ind = factor_cols) + plot(pca, axes = c(opt$PCA_x_axis, opt$PCA_y_axis), cex = 0.2, col.ind = factor_cols) } -if (opt$factor != '') { - if(is.factor(contrasting_factor$factor)) { - legend(x = 'topright', - legend = as.character(factorColors$factor), - col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) +if (opt$factor != "") { + if (is.factor(contrasting_factor$factor)) { + legend(x = "topright", + legend = as.character(factor_colors$factor), + col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7) } else { - legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) + legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) } } dev.off() #save coordinates table - if(opt$table_coordinates != ''){ + if (opt$table_coordinates != "") { coord_table <- cbind(rownames(pca$ind$coord), round(as.data.frame(pca$ind$coord), 6)) - colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc))) + colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$PCA_npc))) } } ########### make HCPC with FactoMineR ########## -if (opt$visu_choice == 'HCPC') { +if (opt$visu_choice == "HCPC") { # HCPC starts with a PCA pca <- PCA( @@ -414,96 +425,76 @@ graph = FALSE, ) -PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA +pca_ind_coord <- as.data.frame(pca$ind$coord) # coordinates of observations in PCA # Hierarchical Clustering On Principal Components Followed By Kmean Clustering -res.hcpc <- HCPC(pca, - nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method, - graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max, - cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk) +res_hcpc <- HCPC(pca, + nb.clust = opt$HCPC_ncluster, metric = opt$HCPC_metric, method = opt$HCPC_method, + graph = FALSE, consol = opt$HCPC_consol, iter.max = opt$HCPC_itermax, min = opt$HCPC_min, max = opt$HCPC_max, + cluster.CA = opt$HCPC_clusterCA, kk = opt$HCPC_kk) # HCPC plots -dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2 +dims <- head(as.data.frame(res_hcpc$call$t$res$eig), 2) # dims variances in column 2 pdf(opt$pdf_out) -plot(res.hcpc, choice="tree") -plot(res.hcpc, choice="bar") -plot(res.hcpc, choice="3D.map") +plot(res_hcpc, choice = "tree") +plot(res_hcpc, choice = "bar") +plot(res_hcpc, choice = "3D.map") if (opt$labels == FALSE) { -plot(res.hcpc, choice="map", label="none") +plot(res_hcpc, choice = "map", label = "none") } else { -plot(res.hcpc, choice="map") +plot(res_hcpc, choice = "map") } # user contrasts on the pca -if (opt$factor != '') { - plot(pca, label="none", col.ind = factor_cols) - if(is.factor(contrasting_factor$factor)) { - legend(x = 'topright', - legend = as.character(factorColors$factor), - col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) - +if (opt$factor != "") { + plot(pca, label = "none", col.ind = factor_cols) + if (is.factor(contrasting_factor$factor)) { + legend(x = "topright", + legend = as.character(factor_colors$factor), + col = factor_colors$color, pch = 16, bty = "n", xjust = 1, cex = 0.7) + ## Normalized Mutual Information sink(opt$HCPC_mutual_info) res <- external_validation( true_labels = as.numeric(contrasting_factor$factor), - clusters = as.numeric(res.hcpc$data.clust$clust), + clusters = as.numeric(res_hcpc$data.clust$clust), summary_stats = TRUE ) sink() } else { - legend.col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) + legend_col(col = rev(brewer.pal(n = 11, name = "RdYlGn")), lev = cut(contrasting_factor$factor, 11, label = FALSE)) } } -## Clusters to which individual observations belong # used ? -# Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)], -# Observation = rownames(res.hcpc$data.clust)) -# metadata <- data.frame(Observation=colnames(data), row.names=colnames(data)) -# metadata = merge(y = metadata, -# x = Clust, -# by = "Observation") - -# unclear utility -# ObsNumberPerCluster = as.data.frame(table(metadata$Cluster)) -# colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber") -# -# ## Silhouette Plot # not used -# hc.cut = hcut(PCA_IndCoord, -# k = nlevels(metadata$Cluster), -# hc_method = "ward.D2") -# -# Sil = fviz_silhouette(hc.cut) -# sil1 = as.data.frame(Sil$data) - dev.off() - if(opt$table_coordinates != ''){ - coord_table <- cbind(Cell=rownames(res.hcpc$call$X), - round(as.data.frame(res.hcpc$call$X[, -length(res.hcpc$call$X)]), 6), - as.data.frame(res.hcpc$call$X[, length(res.hcpc$call$X)]) +if (opt$table_coordinates != "") { + coord_table <- cbind(Cell = rownames(res_hcpc$call$X), + round(as.data.frame(res_hcpc$call$X[, -length(res_hcpc$call$X)]), 6), + as.data.frame(res_hcpc$call$X[, length(res_hcpc$call$X)]) ) - colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster") + colnames(coord_table) <- c("Cells", paste0("DIM", (1:opt$HCPC_npc)), "Cluster") } - if(opt$HCPC_clust != ""){ - res_clustering <- data.frame(Cell = rownames(res.hcpc$data.clust), - Cluster = res.hcpc$data.clust$clust) - +if (opt$HCPC_clust != "") { +res_clustering <- data.frame(Cell = rownames(res_hcpc$data.clust), + Cluster = res_hcpc$data.clust$clust) + } # Description of cluster by most contributing variables / gene expressions # first transform list of vectors in a list of dataframes -extract_description <- lapply(res.hcpc$desc.var$quanti, as.data.frame) +extract_description <- lapply(res_hcpc$desc.var$quanti, as.data.frame) # second, transfer rownames (genes) to column in the dataframe, before rbinding extract_description_w_genes <- Map(cbind, extract_description, - genes= lapply(extract_description, rownames) + genes = lapply(extract_description, rownames) ) # Then use data.table to collapse all generated dataframe, with the cluster id in first column # using the {data.table} rbindlist function cluster_description <- rbindlist(extract_description_w_genes, idcol = "cluster_id") -cluster_description = cluster_description[ ,c(8, 1, 2, 3,4,5,6,7)] # reorganize columns +cluster_description <- cluster_description[, c(8, 1, 2, 3, 4, 5, 6, 7)] # reorganize columns # Finally, output cluster description data frame @@ -511,34 +502,34 @@ cluster_description, file = opt$HCPC_cluster_description, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) } ## Return coordinates file to user -if(opt$table_coordinates != ''){ +if (opt$table_coordinates != "") { write.table( coord_table, file = opt$table_coordinates, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) } -if(opt$HCPC_clust != ""){ +if (opt$HCPC_clust != "") { write.table( res_clustering, file = opt$HCPC_clust, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) }
--- a/high_dim_visu.xml Wed Jun 24 06:22:53 2020 -0400 +++ b/high_dim_visu.xml Sun May 21 16:26:23 2023 +0000 @@ -7,7 +7,7 @@ <requirement type="package" version="0.15=r351he1b5a44_0">r-rtsne</requirement> <requirement type="package" version="0.4.7=r351h6115d3f_0">r-ggfortify</requirement> <requirement type="package" version="1.1.9=r351h0357c0b_0">r-clusterr</requirement> - <!-- <requirement type="package" version="1.11.6=r351hc070d10_0">r-data.table</requirement> --> + <requirement type="package" version="1.2.5=r35h6115d3f_0">r-polychrome</requirement> </requirements> <stdio> <exit_code range="1:" level="fatal" description="Tool exception" />