Mercurial > repos > artbio > gsc_signature_score
diff signature_score.R @ 4:31da579595c5 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit 84220e72e5d52cc9265b7eec20e6826e04d91f1f
author | artbio |
---|---|
date | Sun, 10 Dec 2023 04:11:16 +0000 |
parents | 3351ca630a01 |
children |
line wrap: on
line diff
--- a/signature_score.R Sat Oct 31 18:34:40 2020 +0000 +++ b/signature_score.R Sun Dec 10 04:11:16 2023 +0000 @@ -1,17 +1,13 @@ -######################### -# Signature score # -######################### - +# Signature score # Compute the signature score based on the geometric mean of the target gene expression # and split cells in 2 groups (high/low) using this signature score. -# Example of command -# Rscript 4-signature_score.R --input <input.tsv> --genes ARNT2,SALL2,SOX9,OLIG2,POU3F2 -# --output <output.tab> --pdf <output.pdf> - -# load packages that are provided in the conda env -options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +options(show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } +) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() @@ -21,74 +17,78 @@ library(gridExtra) # Arguments -option_list = list( +option_list <- list( make_option( "--input", default = NA, - type = 'character', + type = "character", help = "Input file that contains log2(CPM +1) values" ), make_option( "--sep", - default = '\t', - type = 'character', + default = "\t", + type = "character", help = "File separator [default : '%default' ]" ), make_option( "--colnames", default = TRUE, - type = 'logical', + type = "logical", help = "Consider first line as header ? [default : '%default' ]" - ), + ), make_option( "--genes", default = NA, - type = 'character', + type = "character", help = "List of genes comma separated" ), make_option( "--percentile_threshold", default = 20, - type = 'integer', + type = "integer", help = "detection threshold to keep a gene in signature set [default : '%default' ]" ), make_option( "--output", default = "./output.tab", - type = 'character', + type = "character", help = "Output path [default : '%default' ]" ), make_option( "--stats", default = "./statistics.tab", - type = 'character', + type = "character", help = "statistics path [default : '%default' ]" ), make_option( "--correlations", default = "./correlations.tab", - type = 'character', + type = "character", help = "Correlations between signature genes [default : '%default' ]" ), make_option( "--covariances", default = "./statistics.tab", - type = 'character', + type = "character", help = "Covariances between signature genes [default : '%default' ]" ), make_option( "--pdf", default = "~/output.pdf", - type = 'character', + type = "character", help = "pdf path [default : '%default' ]" ) ) -opt = parse_args(OptionParser(option_list = option_list), - args = commandArgs(trailingOnly = TRUE)) +opt <- parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) -if (opt$sep == "tab") {opt$sep = "\t"} -if (opt$sep == "comma") {opt$sep = ","} +if (opt$sep == "tab") { + opt$sep <- "\t" +} +if (opt$sep == "comma") { + opt$sep <- "," +} # Take input data data.counts <- read.table( @@ -96,17 +96,17 @@ h = opt$colnames, row.names = 1, sep = opt$sep, - check.names = F + check.names = FALSE ) # Get vector of target genes genes <- unlist(strsplit(opt$genes, ",")) if (length(unique(genes %in% rownames(data.counts))) == 1) { - if (unique(genes %in% rownames(data.counts)) == F) + if (unique(genes %in% rownames(data.counts)) == FALSE) stop("None of these genes are in your dataset: ", opt$genes) } - + logical_genes <- rownames(data.counts) %in% genes # Retrieve target genes in counts data @@ -114,17 +114,21 @@ # compute covariance signature.covariances <- as.data.frame(cov(t(signature.counts))) -signature.covariances <- cbind(gene=rownames(signature.covariances), signature.covariances) -write.table(signature.covariances, file=opt$covariances, quote=F, row.names=F, sep="\t") +signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) +write.table(signature.covariances, + file = opt$covariances, + quote = FALSE, + row.names = FALSE, + sep = "\t") # compute signature.correlations signature.correlations <- as.data.frame(cor(t(signature.counts))) -signature.correlations <- cbind(gene=rownames(signature.correlations), signature.correlations) -write.table(signature.correlations, file=opt$correlations, quote=F, row.names=F, sep="\t") +signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) +write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") ## Descriptive Statistics Function -descriptive_stats = function(InputData) { - SummaryData = data.frame( +descriptive_stats <- function(InputData) { + SummaryData <- data.frame( mean = rowMeans(InputData), SD = apply(InputData, 1, sd), Variance = apply(InputData, 1, var), @@ -137,7 +141,7 @@ signature_stats <- descriptive_stats(signature.counts) -# Find poorly detected genes from the signature +# Find poorly detected genes from the signature kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold # Add warnings @@ -148,7 +152,7 @@ "\n" ) } else { - if (unique(kept_genes) == F) { + if (unique(kept_genes) == FALSE) { stop( "None of these genes are detected in ", opt$percent, @@ -160,8 +164,8 @@ } # Remove genes poorly detected in the dataset -signature.counts <- signature.counts[kept_genes,] - +signature.counts <- signature.counts[kept_genes, ] + # Replace 0 by 1 counts signature.counts[signature.counts == 0] <- 1 @@ -169,19 +173,17 @@ score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych # Add results in signature_output -signature_output <- data.frame( - cell = names(score), - score = score, - rate = ifelse(score > mean(score), "HIGH", "LOW"), - nGenes = colSums(data.counts != 0), - total_counts = colSums(data.counts) - ) +signature_output <- data.frame(cell = names(score), + score = score, + rate = ifelse(score > mean(score), "HIGH", "LOW"), + nGenes = colSums(data.counts != 0), + total_counts = colSums(data.counts)) # statistics of input genes, signature genes first lines statistics.counts <- rbind(subset(data.counts, logical_genes), - subset(data.counts, !logical_genes)) + subset(data.counts, !logical_genes)) statistics <- descriptive_stats(statistics.counts) -statistics <- cbind(gene=rownames(statistics), statistics) +statistics <- cbind(gene = rownames(statistics), statistics) @@ -189,17 +191,17 @@ score <- data.frame(score = score, order = rank(score, ties.method = "first"), signature = signature_output$rate, - stringsAsFactors = F) + stringsAsFactors = FALSE) pdf(file = opt$pdf) -myplot <- ggplot(signature_output, aes(x=rate, y=score)) + - geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F, cex=0.5) + - geom_abline(slope=0, intercept=mean(score$score), lwd=.5, color="red") + - scale_fill_manual(values=c("#ff0000", "#08661e")) + - geom_jitter(size=0.2) + labs(y = "Score", x = "Rate") + - annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust=1.5, - color="black", label = mean(score$score), parse = TRUE) + - labs(title = "Violin plots of Cell signature scores") +myplot <- ggplot(signature_output, aes(x = rate, y = score)) + + geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + + geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + + scale_fill_manual(values = c("#ff0000", "#08661e")) + + geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + + annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, + color = "black", label = mean(score$score), parse = TRUE) + + labs(title = "Violin plots of Cell signature scores") print(myplot) dev.off() @@ -209,16 +211,16 @@ signature_output, opt$output, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) write.table( statistics, opt$stats, sep = "\t", - quote = F, - col.names = T, - row.names = F + quote = FALSE, + col.names = TRUE, + row.names = FALSE )