Mercurial > repos > artbio > gsc_signature_score
view signature_score.R @ 4:31da579595c5 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit 84220e72e5d52cc9265b7eec20e6826e04d91f1f
author | artbio |
---|---|
date | Sun, 10 Dec 2023 04:11:16 +0000 |
parents | 3351ca630a01 |
children |
line wrap: on
line source
# Signature score # Compute the signature score based on the geometric mean of the target gene expression # and split cells in 2 groups (high/low) using this signature score. options(show.error.messages = FALSE, error = function() { cat(geterrmessage(), file = stderr()) q("no", 1, FALSE) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() library(optparse) library(psych) library(ggplot2) library(gridExtra) # Arguments option_list <- list( make_option( "--input", default = NA, type = "character", help = "Input file that contains log2(CPM +1) values" ), make_option( "--sep", default = "\t", type = "character", help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, type = "logical", help = "Consider first line as header ? [default : '%default' ]" ), make_option( "--genes", default = NA, type = "character", help = "List of genes comma separated" ), make_option( "--percentile_threshold", default = 20, type = "integer", help = "detection threshold to keep a gene in signature set [default : '%default' ]" ), make_option( "--output", default = "./output.tab", type = "character", help = "Output path [default : '%default' ]" ), make_option( "--stats", default = "./statistics.tab", type = "character", help = "statistics path [default : '%default' ]" ), make_option( "--correlations", default = "./correlations.tab", type = "character", help = "Correlations between signature genes [default : '%default' ]" ), make_option( "--covariances", default = "./statistics.tab", type = "character", help = "Covariances between signature genes [default : '%default' ]" ), make_option( "--pdf", default = "~/output.pdf", type = "character", help = "pdf path [default : '%default' ]" ) ) opt <- parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE)) if (opt$sep == "tab") { opt$sep <- "\t" } if (opt$sep == "comma") { opt$sep <- "," } # Take input data data.counts <- read.table( opt$input, h = opt$colnames, row.names = 1, sep = opt$sep, check.names = FALSE ) # Get vector of target genes genes <- unlist(strsplit(opt$genes, ",")) if (length(unique(genes %in% rownames(data.counts))) == 1) { if (unique(genes %in% rownames(data.counts)) == FALSE) stop("None of these genes are in your dataset: ", opt$genes) } logical_genes <- rownames(data.counts) %in% genes # Retrieve target genes in counts data signature.counts <- subset(data.counts, logical_genes) # compute covariance signature.covariances <- as.data.frame(cov(t(signature.counts))) signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) write.table(signature.covariances, file = opt$covariances, quote = FALSE, row.names = FALSE, sep = "\t") # compute signature.correlations signature.correlations <- as.data.frame(cor(t(signature.counts))) signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") ## Descriptive Statistics Function descriptive_stats <- function(InputData) { SummaryData <- data.frame( mean = rowMeans(InputData), SD = apply(InputData, 1, sd), Variance = apply(InputData, 1, var), Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { (sum(x != 0) / ncol(y)) * 100 }) ) return(SummaryData) } signature_stats <- descriptive_stats(signature.counts) # Find poorly detected genes from the signature kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold # Add warnings if (length(unique(kept_genes)) > 1) { cat( "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), "\n" ) } else { if (unique(kept_genes) == FALSE) { stop( "None of these genes are detected in ", opt$percent, "% of your cells: ", paste(rownames(signature_stats), collapse = ", "), ". You can be less stringent thanks to --percent parameter." ) } } # Remove genes poorly detected in the dataset signature.counts <- signature.counts[kept_genes, ] # Replace 0 by 1 counts signature.counts[signature.counts == 0] <- 1 # Geometric mean by cell score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych # Add results in signature_output signature_output <- data.frame(cell = names(score), score = score, rate = ifelse(score > mean(score), "HIGH", "LOW"), nGenes = colSums(data.counts != 0), total_counts = colSums(data.counts)) # statistics of input genes, signature genes first lines statistics.counts <- rbind(subset(data.counts, logical_genes), subset(data.counts, !logical_genes)) statistics <- descriptive_stats(statistics.counts) statistics <- cbind(gene = rownames(statistics), statistics) # Re-arrange score matrix for plots score <- data.frame(score = score, order = rank(score, ties.method = "first"), signature = signature_output$rate, stringsAsFactors = FALSE) pdf(file = opt$pdf) myplot <- ggplot(signature_output, aes(x = rate, y = score)) + geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + scale_fill_manual(values = c("#ff0000", "#08661e")) + geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, color = "black", label = mean(score$score), parse = TRUE) + labs(title = "Violin plots of Cell signature scores") print(myplot) dev.off() # Save file write.table( signature_output, opt$output, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE ) write.table( statistics, opt$stats, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE )