diff signature_score.R @ 4:31da579595c5 draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit 84220e72e5d52cc9265b7eec20e6826e04d91f1f
author artbio
date Sun, 10 Dec 2023 04:11:16 +0000
parents 3351ca630a01
children
line wrap: on
line diff
--- a/signature_score.R	Sat Oct 31 18:34:40 2020 +0000
+++ b/signature_score.R	Sun Dec 10 04:11:16 2023 +0000
@@ -1,17 +1,13 @@
-#########################
-#    Signature score    #
-#########################
-
+# Signature score
 # Compute the signature score based on the geometric mean of the target gene expression
 # and split cells  in 2 groups (high/low) using this signature score.
 
-# Example of command
-# Rscript 4-signature_score.R --input <input.tsv> --genes  ARNT2,SALL2,SOX9,OLIG2,POU3F2
-#                             --output <output.tab> --pdf <output.pdf>
-
-# load packages that are provided in the conda env
-options( show.error.messages=F,
-       error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = FALSE,
+  error = function() {
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
+  }
+)
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
 warnings()
 
@@ -21,74 +17,78 @@
 library(gridExtra)
 
 # Arguments
-option_list = list(
+option_list <- list(
   make_option(
     "--input",
     default = NA,
-    type = 'character',
+    type = "character",
     help = "Input file that contains log2(CPM +1) values"
   ),
   make_option(
     "--sep",
-    default = '\t',
-    type = 'character',
+    default = "\t",
+    type = "character",
     help = "File separator [default : '%default' ]"
   ),
   make_option(
     "--colnames",
     default = TRUE,
-    type = 'logical',
+    type = "logical",
     help = "Consider first line as header ? [default : '%default' ]"
-  ),  
+  ),
   make_option(
     "--genes",
     default = NA,
-    type = 'character',
+    type = "character",
     help = "List of genes comma separated"
   ),
   make_option(
     "--percentile_threshold",
     default = 20,
-    type = 'integer',
+    type = "integer",
     help = "detection threshold to keep a gene in signature set [default : '%default' ]"
   ),
   make_option(
     "--output",
     default = "./output.tab",
-    type = 'character',
+    type = "character",
     help = "Output path [default : '%default' ]"
   ),
   make_option(
     "--stats",
     default = "./statistics.tab",
-    type = 'character',
+    type = "character",
     help = "statistics path [default : '%default' ]"
   ),
   make_option(
     "--correlations",
     default = "./correlations.tab",
-    type = 'character',
+    type = "character",
     help = "Correlations between signature genes  [default : '%default' ]"
   ),
   make_option(
     "--covariances",
     default = "./statistics.tab",
-    type = 'character',
+    type = "character",
     help = "Covariances between signature genes [default : '%default' ]"
   ),
   make_option(
     "--pdf",
     default = "~/output.pdf",
-    type = 'character',
+    type = "character",
     help = "pdf path [default : '%default' ]"
   )
 )
 
-opt = parse_args(OptionParser(option_list = option_list),
-                 args = commandArgs(trailingOnly = TRUE))
+opt <- parse_args(OptionParser(option_list = option_list),
+                  args = commandArgs(trailingOnly = TRUE))
 
-if (opt$sep == "tab") {opt$sep = "\t"}
-if (opt$sep == "comma") {opt$sep = ","}
+if (opt$sep == "tab") {
+  opt$sep <- "\t"
+}
+if (opt$sep == "comma") {
+  opt$sep <- ","
+}
 
 # Take input data
 data.counts <- read.table(
@@ -96,17 +96,17 @@
   h = opt$colnames,
   row.names = 1,
   sep = opt$sep,
-  check.names = F
+  check.names = FALSE
 )
 
 # Get vector of target genes
 genes <- unlist(strsplit(opt$genes, ","))
 
 if (length(unique(genes %in% rownames(data.counts))) == 1) {
-  if (unique(genes %in% rownames(data.counts)) == F)
+  if (unique(genes %in% rownames(data.counts)) == FALSE)
     stop("None of these genes are in your dataset: ", opt$genes)
 }
-    
+
 logical_genes <- rownames(data.counts) %in% genes
 
 # Retrieve target genes in counts data
@@ -114,17 +114,21 @@
 
 # compute covariance
 signature.covariances <- as.data.frame(cov(t(signature.counts)))
-signature.covariances <- cbind(gene=rownames(signature.covariances), signature.covariances)
-write.table(signature.covariances, file=opt$covariances, quote=F, row.names=F, sep="\t")
+signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances)
+write.table(signature.covariances,
+            file = opt$covariances,
+            quote = FALSE,
+            row.names = FALSE,
+            sep = "\t")
 
 # compute signature.correlations
 signature.correlations <- as.data.frame(cor(t(signature.counts)))
-signature.correlations <- cbind(gene=rownames(signature.correlations), signature.correlations)
-write.table(signature.correlations, file=opt$correlations, quote=F, row.names=F, sep="\t")
+signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations)
+write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t")
 
 ## Descriptive Statistics Function
-descriptive_stats = function(InputData) {
-  SummaryData = data.frame(
+descriptive_stats <- function(InputData) {
+  SummaryData <- data.frame(
     mean = rowMeans(InputData),
     SD = apply(InputData, 1, sd),
     Variance = apply(InputData, 1, var),
@@ -137,7 +141,7 @@
 
 signature_stats <- descriptive_stats(signature.counts)
 
-# Find poorly detected genes from the signature 
+# Find poorly detected genes from the signature
 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
 
 # Add warnings
@@ -148,7 +152,7 @@
     "\n"
   )
 } else {
-  if (unique(kept_genes) == F) {
+  if (unique(kept_genes) == FALSE) {
     stop(
       "None of these genes are detected in ",
       opt$percent,
@@ -160,8 +164,8 @@
 }
 
 # Remove genes poorly detected in the dataset
-signature.counts <- signature.counts[kept_genes,]
-    
+signature.counts <- signature.counts[kept_genes, ]
+
 # Replace 0 by 1 counts
 signature.counts[signature.counts == 0] <- 1
 
@@ -169,19 +173,17 @@
 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych
 
 # Add results in signature_output
-signature_output <- data.frame(
-                         cell = names(score),
-                         score = score,
-                         rate = ifelse(score > mean(score), "HIGH", "LOW"),
-                         nGenes = colSums(data.counts != 0),
-                         total_counts = colSums(data.counts)
-                         )
+signature_output <- data.frame(cell = names(score),
+                               score = score,
+                               rate = ifelse(score > mean(score), "HIGH", "LOW"),
+                               nGenes = colSums(data.counts != 0),
+                               total_counts = colSums(data.counts))
 
 # statistics of input genes, signature genes first lines
 statistics.counts <- rbind(subset(data.counts, logical_genes),
-                    subset(data.counts, !logical_genes))  
+                           subset(data.counts, !logical_genes))
 statistics <- descriptive_stats(statistics.counts)
-statistics <- cbind(gene=rownames(statistics), statistics)
+statistics <- cbind(gene = rownames(statistics), statistics)
 
 
 
@@ -189,17 +191,17 @@
 score <- data.frame(score = score,
                     order = rank(score, ties.method = "first"),
                     signature = signature_output$rate,
-                    stringsAsFactors = F)
+                    stringsAsFactors = FALSE)
 
 pdf(file = opt$pdf)
-myplot <- ggplot(signature_output, aes(x=rate, y=score)) +
-                 geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F, cex=0.5) +
-                 geom_abline(slope=0, intercept=mean(score$score), lwd=.5, color="red") +
-                 scale_fill_manual(values=c("#ff0000", "#08661e")) +
-                 geom_jitter(size=0.2) + labs(y = "Score", x = "Rate") +
-                 annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust=1.5,
-                           color="black", label = mean(score$score), parse = TRUE) +
-                 labs(title = "Violin plots of Cell signature scores")
+myplot <- ggplot(signature_output, aes(x = rate, y = score)) +
+  geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) +
+  geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") +
+  scale_fill_manual(values = c("#ff0000", "#08661e")) +
+  geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") +
+  annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5,
+           color = "black", label = mean(score$score), parse = TRUE) +
+  labs(title = "Violin plots of Cell signature scores")
 
 print(myplot)
 dev.off()
@@ -209,16 +211,16 @@
   signature_output,
   opt$output,
   sep = "\t",
-  quote = F,
-  col.names = T,
-  row.names = F
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
 )
 
 write.table(
   statistics,
   opt$stats,
   sep = "\t",
-  quote = F,
-  col.names = T,
-  row.names = F
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
 )