Mercurial > repos > artbio > ez_histograms
diff ez_histograms.R @ 0:bdf40b0924cb draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/main/tools/ez_histograms commit 443759a746f78d67dc4ffcafdc6610d09d278846
author | artbio |
---|---|
date | Wed, 07 Feb 2024 19:49:56 +0000 |
parents | |
children | fbedb212982d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ez_histograms.R Wed Feb 07 19:49:56 2024 +0000 @@ -0,0 +1,169 @@ +library(ggplot2) +library(reshape2) +library(dplyr) +library(scales) +library(vtable) +library(optparse) + +options(show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } +) + +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +warnings() + +option_list <- list( + make_option( + c("-f", "--file"), + default = NA, + type = "character", + help = "Input file that contains count values to transform" + ), + make_option( + c("-d", "--profile"), + default = "count", + type = "character", + help = "Whether y-axis shows absolute counts or density: 'count' or 'density' [default : '%default' ]" + ), + make_option( + "--xscale", + default = "cartesian", + type = "character", + help = "Whether x-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" + ), + make_option( + "--yscale", + default = "cartesian", + type = "character", + help = "Whether y-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" + ), + make_option( + c("-p", "--pdf"), + default = "histograms.pdf", + type = "character", + help = "Output pdf file name [default : '%default' ]" + ), + make_option( + c("-s", "--summary"), + default = "summary.tsv", + type = "character", + help = "statistics summary file name [default : '%default' ]" + ) +) + +opt <- parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) + +plot_histograms <- function(mdata, profile = "count", xscale = "cartesian", yscale = "cartesian", bins = 30) { + if (profile == "count") { + # count histogram + p <- ggplot(mdata, aes(x = value, fill = variable, color = variable, y = after_stat(count)), show.legend = FALSE) + + geom_histogram(bins = bins) + theme(legend.position = "none") + if (xscale == "cartesian") { + if (yscale == "log2") { + p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) + } else { + if (yscale == "log10") { + p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) + } + } + } + if (xscale == "log2") { + p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) + if (yscale == "log2") { + p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) + } else { + if (yscale == "log10") { + p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) + } + } + } + if (xscale == "log10") { + p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) + if (yscale == "log2") { + p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) + } else { + if (yscale == "log10") { + p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) + } + } + } + } + + if (profile == "density") { + # density histogram + p <- ggplot(mdata, aes(x = value, fill = variable, color = variable)) + + geom_density() + theme(legend.position = "none") + if (xscale == "log2") { + p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) + } + if (xscale == "log10") { + p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) + } + } + return(p) +} + +test_header <- function(file) { + data <- read.delim(file = file, header = FALSE, row.names = 1, nrows = 2) + if (all(is.na(as.numeric(data[1, seq_len(ncol(data))])))) { + return(TRUE) + } else { + return(FALSE) + } +} + +test_rownames <- function(file) { + data <- read.delim(file = file, header = FALSE, row.names = NULL, nrows = 2) + if (is.na(as.numeric(data[2, 1]))) { + return(1) + } else { + return(NULL) + } +} + +##### prepare input data +data <- read.delim(file = opt$file, header = test_header(opt$file), row.names = test_rownames(opt$file)) +data <- data %>% select(where(is.numeric)) # remove non numeric columns +mdata <- melt(data) + +##### main + +# determine optimal number of bins (Sturges’ Rule) +bins <- ceiling(log2(nrow(data)) + 1) +# plot +p <- plot_histograms(mdata, profile = opt$profile, xscale = opt$xscale, bins = bins, yscale = opt$yscale) + +# determine optimal width for the graph +width <- length(data) +width <- case_when( + width == 1 ~ 14 / 3, + width == 2 ~ (2 / 3) * 14, + TRUE ~ 14 +) +# determine optimal height for the graph +height <- length(data) +height <- case_when( + height <= 3 ~ 3, + height <= 6 ~ 6, + TRUE ~ (floor(height / 3) + 1) * 3 +) +# determine optimal number of col for the graph +ncol <- length(data) +ncol <- case_when( + ncol == 1 ~ 1, + ncol == 2 ~ 2, + TRUE ~ 3 +) +pdf(opt$pdf, width = width, height = height) +print(p + facet_wrap(~variable, ncol = ncol, scales = "free")) +dev.off() + +# Summary statistics with vtable package +summary_df <- sumtable(data, digits = 8, out = "return", add.median = TRUE, + summ.names = c("N", "Mean", "Std. Dev.", "Min", "Pctl. 25", + "Median", "Pctl. 75", "Max")) +write.table(summary_df, file = opt$summary, sep = "\t", quote = FALSE, row.names = FALSE)