# HG changeset patch # User artbio # Date 1731019438 0 # Node ID bd810b6d8a3bba134c29331eba15f8f9288d064c # Parent 31da579595c5dd049a602f7a504db24b9cba8423 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit ae36c63e1e5c0c285593d427f0ffd348a3d89e3f diff -r 31da579595c5 -r bd810b6d8a3b signature_score.R --- a/signature_score.R Sun Dec 10 04:11:16 2023 +0000 +++ b/signature_score.R Thu Nov 07 22:43:58 2024 +0000 @@ -2,11 +2,12 @@ # Compute the signature score based on the geometric mean of the target gene expression # and split cells in 2 groups (high/low) using this signature score. -options(show.error.messages = FALSE, - error = function() { - cat(geterrmessage(), file = stderr()) - q("no", 1, FALSE) - } +options( + show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() @@ -18,93 +19,95 @@ # Arguments option_list <- list( - make_option( - "--input", - default = NA, - type = "character", - help = "Input file that contains log2(CPM +1) values" - ), - make_option( - "--sep", - default = "\t", - type = "character", - help = "File separator [default : '%default' ]" - ), - make_option( - "--colnames", - default = TRUE, - type = "logical", - help = "Consider first line as header ? [default : '%default' ]" - ), - make_option( - "--genes", - default = NA, - type = "character", - help = "List of genes comma separated" - ), - make_option( - "--percentile_threshold", - default = 20, - type = "integer", - help = "detection threshold to keep a gene in signature set [default : '%default' ]" - ), - make_option( - "--output", - default = "./output.tab", - type = "character", - help = "Output path [default : '%default' ]" - ), - make_option( - "--stats", - default = "./statistics.tab", - type = "character", - help = "statistics path [default : '%default' ]" - ), - make_option( - "--correlations", - default = "./correlations.tab", - type = "character", - help = "Correlations between signature genes [default : '%default' ]" - ), - make_option( - "--covariances", - default = "./statistics.tab", - type = "character", - help = "Covariances between signature genes [default : '%default' ]" - ), - make_option( - "--pdf", - default = "~/output.pdf", - type = "character", - help = "pdf path [default : '%default' ]" - ) + make_option( + "--input", + default = NA, + type = "character", + help = "Input file that contains log2(CPM +1) values" + ), + make_option( + "--sep", + default = "\t", + type = "character", + help = "File separator [default : '%default' ]" + ), + make_option( + "--colnames", + default = TRUE, + type = "logical", + help = "Consider first line as header ? [default : '%default' ]" + ), + make_option( + "--genes", + default = NA, + type = "character", + help = "List of genes comma separated" + ), + make_option( + "--percentile_threshold", + default = 20, + type = "integer", + help = "detection threshold to keep a gene in signature set [default : '%default' ]" + ), + make_option( + "--output", + default = "./output.tab", + type = "character", + help = "Output path [default : '%default' ]" + ), + make_option( + "--stats", + default = "./statistics.tab", + type = "character", + help = "statistics path [default : '%default' ]" + ), + make_option( + "--correlations", + default = "./correlations.tab", + type = "character", + help = "Correlations between signature genes [default : '%default' ]" + ), + make_option( + "--covariances", + default = "./statistics.tab", + type = "character", + help = "Covariances between signature genes [default : '%default' ]" + ), + make_option( + "--pdf", + default = "~/output.pdf", + type = "character", + help = "pdf path [default : '%default' ]" + ) ) opt <- parse_args(OptionParser(option_list = option_list), - args = commandArgs(trailingOnly = TRUE)) + args = commandArgs(trailingOnly = TRUE) +) if (opt$sep == "tab") { - opt$sep <- "\t" + opt$sep <- "\t" } if (opt$sep == "comma") { - opt$sep <- "," + opt$sep <- "," } # Take input data data.counts <- read.table( - opt$input, - h = opt$colnames, - row.names = 1, - sep = opt$sep, - check.names = FALSE + opt$input, + h = opt$colnames, + row.names = 1, + sep = opt$sep, + check.names = FALSE ) # Get vector of target genes genes <- unlist(strsplit(opt$genes, ",")) if (length(unique(genes %in% rownames(data.counts))) == 1) { - if (unique(genes %in% rownames(data.counts)) == FALSE) - stop("None of these genes are in your dataset: ", opt$genes) + if (unique(genes %in% rownames(data.counts)) == FALSE) { + stop("None of these genes are in your dataset: ", opt$genes) + } } logical_genes <- rownames(data.counts) %in% genes @@ -116,10 +119,11 @@ signature.covariances <- as.data.frame(cov(t(signature.counts))) signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) write.table(signature.covariances, - file = opt$covariances, - quote = FALSE, - row.names = FALSE, - sep = "\t") + file = opt$covariances, + quote = FALSE, + row.names = FALSE, + sep = "\t" +) # compute signature.correlations signature.correlations <- as.data.frame(cor(t(signature.counts))) @@ -128,15 +132,15 @@ ## Descriptive Statistics Function descriptive_stats <- function(InputData) { - SummaryData <- data.frame( - mean = rowMeans(InputData), - SD = apply(InputData, 1, sd), - Variance = apply(InputData, 1, var), - Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { - (sum(x != 0) / ncol(y)) * 100 - }) - ) - return(SummaryData) + SummaryData <- data.frame( + mean = rowMeans(InputData), + SD = apply(InputData, 1, sd), + Variance = apply(InputData, 1, var), + Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { + (sum(x != 0) / ncol(y)) * 100 + }) + ) + return(SummaryData) } signature_stats <- descriptive_stats(signature.counts) @@ -146,21 +150,21 @@ # Add warnings if (length(unique(kept_genes)) > 1) { - cat( - "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", - paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), - "\n" - ) + cat( + "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", + paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), + "\n" + ) } else { - if (unique(kept_genes) == FALSE) { - stop( - "None of these genes are detected in ", - opt$percent, - "% of your cells: ", - paste(rownames(signature_stats), collapse = ", "), - ". You can be less stringent thanks to --percent parameter." - ) - } + if (unique(kept_genes) == FALSE) { + stop( + "None of these genes are detected in ", + opt$percent, + "% of your cells: ", + paste(rownames(signature_stats), collapse = ", "), + ". You can be less stringent thanks to --percent parameter." + ) + } } # Remove genes poorly detected in the dataset @@ -173,54 +177,63 @@ score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych # Add results in signature_output -signature_output <- data.frame(cell = names(score), - score = score, - rate = ifelse(score > mean(score), "HIGH", "LOW"), - nGenes = colSums(data.counts != 0), - total_counts = colSums(data.counts)) +signature_output <- data.frame( + cell = names(score), + score = score, + rate = ifelse(score > mean(score), "HIGH", "LOW"), + nGenes = colSums(data.counts != 0), + total_counts = colSums(data.counts) +) # statistics of input genes, signature genes first lines -statistics.counts <- rbind(subset(data.counts, logical_genes), - subset(data.counts, !logical_genes)) +statistics.counts <- rbind( + subset(data.counts, logical_genes), + subset(data.counts, !logical_genes) +) statistics <- descriptive_stats(statistics.counts) statistics <- cbind(gene = rownames(statistics), statistics) # Re-arrange score matrix for plots -score <- data.frame(score = score, - order = rank(score, ties.method = "first"), - signature = signature_output$rate, - stringsAsFactors = FALSE) +score <- data.frame( + score = score, + order = rank(score, ties.method = "first"), + signature = signature_output$rate, + stringsAsFactors = FALSE +) pdf(file = opt$pdf) myplot <- ggplot(signature_output, aes(x = rate, y = score)) + - geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + - geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + - scale_fill_manual(values = c("#ff0000", "#08661e")) + - geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + - annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, - color = "black", label = mean(score$score), parse = TRUE) + - labs(title = "Violin plots of Cell signature scores") + geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + + geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + + scale_fill_manual(values = c("#ff0000", "#08661e")) + + geom_jitter(size = 0.2) + + labs(y = "Score", x = "Rate") + + annotate("text", + x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, + color = "black", label = mean(score$score), parse = TRUE + ) + + labs(title = "Violin plots of Cell signature scores") print(myplot) dev.off() # Save file write.table( - signature_output, - opt$output, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE + signature_output, + opt$output, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) write.table( - statistics, - opt$stats, - sep = "\t", - quote = FALSE, - col.names = TRUE, - row.names = FALSE + statistics, + opt$stats, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE ) diff -r 31da579595c5 -r bd810b6d8a3b signature_score.xml --- a/signature_score.xml Sun Dec 10 04:11:16 2023 +0000 +++ b/signature_score.xml Thu Nov 07 22:43:58 2024 +0000 @@ -1,5 +1,8 @@ - + in single cell RNAseq + + galaxy_single_cell_suite + r-optparse r-ggplot2