Mercurial > repos > artbio > gsc_signature_score
comparison signature_score.R @ 5:bd810b6d8a3b draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit ae36c63e1e5c0c285593d427f0ffd348a3d89e3f
| author | artbio |
|---|---|
| date | Thu, 07 Nov 2024 22:43:58 +0000 |
| parents | 31da579595c5 |
| children |
comparison
equal
deleted
inserted
replaced
| 4:31da579595c5 | 5:bd810b6d8a3b |
|---|---|
| 1 # Signature score | 1 # Signature score |
| 2 # Compute the signature score based on the geometric mean of the target gene expression | 2 # Compute the signature score based on the geometric mean of the target gene expression |
| 3 # and split cells in 2 groups (high/low) using this signature score. | 3 # and split cells in 2 groups (high/low) using this signature score. |
| 4 | 4 |
| 5 options(show.error.messages = FALSE, | 5 options( |
| 6 error = function() { | 6 show.error.messages = FALSE, |
| 7 cat(geterrmessage(), file = stderr()) | 7 error = function() { |
| 8 q("no", 1, FALSE) | 8 cat(geterrmessage(), file = stderr()) |
| 9 } | 9 q("no", 1, FALSE) |
| 10 } | |
| 10 ) | 11 ) |
| 11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
| 12 warnings() | 13 warnings() |
| 13 | 14 |
| 14 library(optparse) | 15 library(optparse) |
| 16 library(ggplot2) | 17 library(ggplot2) |
| 17 library(gridExtra) | 18 library(gridExtra) |
| 18 | 19 |
| 19 # Arguments | 20 # Arguments |
| 20 option_list <- list( | 21 option_list <- list( |
| 21 make_option( | 22 make_option( |
| 22 "--input", | 23 "--input", |
| 23 default = NA, | 24 default = NA, |
| 24 type = "character", | 25 type = "character", |
| 25 help = "Input file that contains log2(CPM +1) values" | 26 help = "Input file that contains log2(CPM +1) values" |
| 26 ), | 27 ), |
| 27 make_option( | 28 make_option( |
| 28 "--sep", | 29 "--sep", |
| 29 default = "\t", | 30 default = "\t", |
| 30 type = "character", | 31 type = "character", |
| 31 help = "File separator [default : '%default' ]" | 32 help = "File separator [default : '%default' ]" |
| 32 ), | 33 ), |
| 33 make_option( | 34 make_option( |
| 34 "--colnames", | 35 "--colnames", |
| 35 default = TRUE, | 36 default = TRUE, |
| 36 type = "logical", | 37 type = "logical", |
| 37 help = "Consider first line as header ? [default : '%default' ]" | 38 help = "Consider first line as header ? [default : '%default' ]" |
| 38 ), | 39 ), |
| 39 make_option( | 40 make_option( |
| 40 "--genes", | 41 "--genes", |
| 41 default = NA, | 42 default = NA, |
| 42 type = "character", | 43 type = "character", |
| 43 help = "List of genes comma separated" | 44 help = "List of genes comma separated" |
| 44 ), | 45 ), |
| 45 make_option( | 46 make_option( |
| 46 "--percentile_threshold", | 47 "--percentile_threshold", |
| 47 default = 20, | 48 default = 20, |
| 48 type = "integer", | 49 type = "integer", |
| 49 help = "detection threshold to keep a gene in signature set [default : '%default' ]" | 50 help = "detection threshold to keep a gene in signature set [default : '%default' ]" |
| 50 ), | 51 ), |
| 51 make_option( | 52 make_option( |
| 52 "--output", | 53 "--output", |
| 53 default = "./output.tab", | 54 default = "./output.tab", |
| 54 type = "character", | 55 type = "character", |
| 55 help = "Output path [default : '%default' ]" | 56 help = "Output path [default : '%default' ]" |
| 56 ), | 57 ), |
| 57 make_option( | 58 make_option( |
| 58 "--stats", | 59 "--stats", |
| 59 default = "./statistics.tab", | 60 default = "./statistics.tab", |
| 60 type = "character", | 61 type = "character", |
| 61 help = "statistics path [default : '%default' ]" | 62 help = "statistics path [default : '%default' ]" |
| 62 ), | 63 ), |
| 63 make_option( | 64 make_option( |
| 64 "--correlations", | 65 "--correlations", |
| 65 default = "./correlations.tab", | 66 default = "./correlations.tab", |
| 66 type = "character", | 67 type = "character", |
| 67 help = "Correlations between signature genes [default : '%default' ]" | 68 help = "Correlations between signature genes [default : '%default' ]" |
| 68 ), | 69 ), |
| 69 make_option( | 70 make_option( |
| 70 "--covariances", | 71 "--covariances", |
| 71 default = "./statistics.tab", | 72 default = "./statistics.tab", |
| 72 type = "character", | 73 type = "character", |
| 73 help = "Covariances between signature genes [default : '%default' ]" | 74 help = "Covariances between signature genes [default : '%default' ]" |
| 74 ), | 75 ), |
| 75 make_option( | 76 make_option( |
| 76 "--pdf", | 77 "--pdf", |
| 77 default = "~/output.pdf", | 78 default = "~/output.pdf", |
| 78 type = "character", | 79 type = "character", |
| 79 help = "pdf path [default : '%default' ]" | 80 help = "pdf path [default : '%default' ]" |
| 80 ) | 81 ) |
| 81 ) | 82 ) |
| 82 | 83 |
| 83 opt <- parse_args(OptionParser(option_list = option_list), | 84 opt <- parse_args(OptionParser(option_list = option_list), |
| 84 args = commandArgs(trailingOnly = TRUE)) | 85 args = commandArgs(trailingOnly = TRUE) |
| 86 ) | |
| 85 | 87 |
| 86 if (opt$sep == "tab") { | 88 if (opt$sep == "tab") { |
| 87 opt$sep <- "\t" | 89 opt$sep <- "\t" |
| 88 } | 90 } |
| 89 if (opt$sep == "comma") { | 91 if (opt$sep == "comma") { |
| 90 opt$sep <- "," | 92 opt$sep <- "," |
| 91 } | 93 } |
| 92 | 94 |
| 93 # Take input data | 95 # Take input data |
| 94 data.counts <- read.table( | 96 data.counts <- read.table( |
| 95 opt$input, | 97 opt$input, |
| 96 h = opt$colnames, | 98 h = opt$colnames, |
| 97 row.names = 1, | 99 row.names = 1, |
| 98 sep = opt$sep, | 100 sep = opt$sep, |
| 99 check.names = FALSE | 101 check.names = FALSE |
| 100 ) | 102 ) |
| 101 | 103 |
| 102 # Get vector of target genes | 104 # Get vector of target genes |
| 103 genes <- unlist(strsplit(opt$genes, ",")) | 105 genes <- unlist(strsplit(opt$genes, ",")) |
| 104 | 106 |
| 105 if (length(unique(genes %in% rownames(data.counts))) == 1) { | 107 if (length(unique(genes %in% rownames(data.counts))) == 1) { |
| 106 if (unique(genes %in% rownames(data.counts)) == FALSE) | 108 if (unique(genes %in% rownames(data.counts)) == FALSE) { |
| 107 stop("None of these genes are in your dataset: ", opt$genes) | 109 stop("None of these genes are in your dataset: ", opt$genes) |
| 110 } | |
| 108 } | 111 } |
| 109 | 112 |
| 110 logical_genes <- rownames(data.counts) %in% genes | 113 logical_genes <- rownames(data.counts) %in% genes |
| 111 | 114 |
| 112 # Retrieve target genes in counts data | 115 # Retrieve target genes in counts data |
| 114 | 117 |
| 115 # compute covariance | 118 # compute covariance |
| 116 signature.covariances <- as.data.frame(cov(t(signature.counts))) | 119 signature.covariances <- as.data.frame(cov(t(signature.counts))) |
| 117 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) | 120 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) |
| 118 write.table(signature.covariances, | 121 write.table(signature.covariances, |
| 119 file = opt$covariances, | 122 file = opt$covariances, |
| 120 quote = FALSE, | 123 quote = FALSE, |
| 121 row.names = FALSE, | 124 row.names = FALSE, |
| 122 sep = "\t") | 125 sep = "\t" |
| 126 ) | |
| 123 | 127 |
| 124 # compute signature.correlations | 128 # compute signature.correlations |
| 125 signature.correlations <- as.data.frame(cor(t(signature.counts))) | 129 signature.correlations <- as.data.frame(cor(t(signature.counts))) |
| 126 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) | 130 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) |
| 127 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") | 131 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") |
| 128 | 132 |
| 129 ## Descriptive Statistics Function | 133 ## Descriptive Statistics Function |
| 130 descriptive_stats <- function(InputData) { | 134 descriptive_stats <- function(InputData) { |
| 131 SummaryData <- data.frame( | 135 SummaryData <- data.frame( |
| 132 mean = rowMeans(InputData), | 136 mean = rowMeans(InputData), |
| 133 SD = apply(InputData, 1, sd), | 137 SD = apply(InputData, 1, sd), |
| 134 Variance = apply(InputData, 1, var), | 138 Variance = apply(InputData, 1, var), |
| 135 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { | 139 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { |
| 136 (sum(x != 0) / ncol(y)) * 100 | 140 (sum(x != 0) / ncol(y)) * 100 |
| 137 }) | 141 }) |
| 138 ) | 142 ) |
| 139 return(SummaryData) | 143 return(SummaryData) |
| 140 } | 144 } |
| 141 | 145 |
| 142 signature_stats <- descriptive_stats(signature.counts) | 146 signature_stats <- descriptive_stats(signature.counts) |
| 143 | 147 |
| 144 # Find poorly detected genes from the signature | 148 # Find poorly detected genes from the signature |
| 145 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold | 149 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold |
| 146 | 150 |
| 147 # Add warnings | 151 # Add warnings |
| 148 if (length(unique(kept_genes)) > 1) { | 152 if (length(unique(kept_genes)) > 1) { |
| 149 cat( | 153 cat( |
| 150 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", | 154 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", |
| 151 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), | 155 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), |
| 152 "\n" | 156 "\n" |
| 153 ) | 157 ) |
| 154 } else { | 158 } else { |
| 155 if (unique(kept_genes) == FALSE) { | 159 if (unique(kept_genes) == FALSE) { |
| 156 stop( | 160 stop( |
| 157 "None of these genes are detected in ", | 161 "None of these genes are detected in ", |
| 158 opt$percent, | 162 opt$percent, |
| 159 "% of your cells: ", | 163 "% of your cells: ", |
| 160 paste(rownames(signature_stats), collapse = ", "), | 164 paste(rownames(signature_stats), collapse = ", "), |
| 161 ". You can be less stringent thanks to --percent parameter." | 165 ". You can be less stringent thanks to --percent parameter." |
| 162 ) | 166 ) |
| 163 } | 167 } |
| 164 } | 168 } |
| 165 | 169 |
| 166 # Remove genes poorly detected in the dataset | 170 # Remove genes poorly detected in the dataset |
| 167 signature.counts <- signature.counts[kept_genes, ] | 171 signature.counts <- signature.counts[kept_genes, ] |
| 168 | 172 |
| 171 | 175 |
| 172 # Geometric mean by cell | 176 # Geometric mean by cell |
| 173 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych | 177 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych |
| 174 | 178 |
| 175 # Add results in signature_output | 179 # Add results in signature_output |
| 176 signature_output <- data.frame(cell = names(score), | 180 signature_output <- data.frame( |
| 177 score = score, | 181 cell = names(score), |
| 178 rate = ifelse(score > mean(score), "HIGH", "LOW"), | 182 score = score, |
| 179 nGenes = colSums(data.counts != 0), | 183 rate = ifelse(score > mean(score), "HIGH", "LOW"), |
| 180 total_counts = colSums(data.counts)) | 184 nGenes = colSums(data.counts != 0), |
| 185 total_counts = colSums(data.counts) | |
| 186 ) | |
| 181 | 187 |
| 182 # statistics of input genes, signature genes first lines | 188 # statistics of input genes, signature genes first lines |
| 183 statistics.counts <- rbind(subset(data.counts, logical_genes), | 189 statistics.counts <- rbind( |
| 184 subset(data.counts, !logical_genes)) | 190 subset(data.counts, logical_genes), |
| 191 subset(data.counts, !logical_genes) | |
| 192 ) | |
| 185 statistics <- descriptive_stats(statistics.counts) | 193 statistics <- descriptive_stats(statistics.counts) |
| 186 statistics <- cbind(gene = rownames(statistics), statistics) | 194 statistics <- cbind(gene = rownames(statistics), statistics) |
| 187 | 195 |
| 188 | 196 |
| 189 | 197 |
| 190 # Re-arrange score matrix for plots | 198 # Re-arrange score matrix for plots |
| 191 score <- data.frame(score = score, | 199 score <- data.frame( |
| 192 order = rank(score, ties.method = "first"), | 200 score = score, |
| 193 signature = signature_output$rate, | 201 order = rank(score, ties.method = "first"), |
| 194 stringsAsFactors = FALSE) | 202 signature = signature_output$rate, |
| 203 stringsAsFactors = FALSE | |
| 204 ) | |
| 195 | 205 |
| 196 pdf(file = opt$pdf) | 206 pdf(file = opt$pdf) |
| 197 myplot <- ggplot(signature_output, aes(x = rate, y = score)) + | 207 myplot <- ggplot(signature_output, aes(x = rate, y = score)) + |
| 198 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + | 208 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + |
| 199 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + | 209 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + |
| 200 scale_fill_manual(values = c("#ff0000", "#08661e")) + | 210 scale_fill_manual(values = c("#ff0000", "#08661e")) + |
| 201 geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + | 211 geom_jitter(size = 0.2) + |
| 202 annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, | 212 labs(y = "Score", x = "Rate") + |
| 203 color = "black", label = mean(score$score), parse = TRUE) + | 213 annotate("text", |
| 204 labs(title = "Violin plots of Cell signature scores") | 214 x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, |
| 215 color = "black", label = mean(score$score), parse = TRUE | |
| 216 ) + | |
| 217 labs(title = "Violin plots of Cell signature scores") | |
| 205 | 218 |
| 206 print(myplot) | 219 print(myplot) |
| 207 dev.off() | 220 dev.off() |
| 208 | 221 |
| 209 # Save file | 222 # Save file |
| 210 write.table( | 223 write.table( |
| 211 signature_output, | 224 signature_output, |
| 212 opt$output, | 225 opt$output, |
| 213 sep = "\t", | 226 sep = "\t", |
| 214 quote = FALSE, | 227 quote = FALSE, |
| 215 col.names = TRUE, | 228 col.names = TRUE, |
| 216 row.names = FALSE | 229 row.names = FALSE |
| 217 ) | 230 ) |
| 218 | 231 |
| 219 write.table( | 232 write.table( |
| 220 statistics, | 233 statistics, |
| 221 opt$stats, | 234 opt$stats, |
| 222 sep = "\t", | 235 sep = "\t", |
| 223 quote = FALSE, | 236 quote = FALSE, |
| 224 col.names = TRUE, | 237 col.names = TRUE, |
| 225 row.names = FALSE | 238 row.names = FALSE |
| 226 ) | 239 ) |
