Mercurial > repos > artbio > gsc_signature_score
comparison signature_score.R @ 0:baf7a079bce0 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_signature_score commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:40:08 -0400 |
parents | |
children | e08419b8ec24 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:baf7a079bce0 |
---|---|
1 ######################### | |
2 # Signature score # | |
3 ######################### | |
4 | |
5 # Compute the signature score based on the geometric mean of the target gene expression | |
6 # and split cells in 2 groups (high/low) using this signature score. | |
7 | |
8 # Example of command | |
9 # Rscript 4-signature_score.R --input <input.tsv> --genes ARNT2,SALL2,SOX9,OLIG2,POU3F2 | |
10 # --output <output.tab> --pdf <output.pdf> | |
11 | |
12 # load packages that are provided in the conda env | |
13 options( show.error.messages=F, | |
14 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
16 warnings() | |
17 | |
18 library(optparse) | |
19 library(psych) | |
20 library(ggplot2) | |
21 library(gridExtra) | |
22 | |
23 # Arguments | |
24 option_list = list( | |
25 make_option( | |
26 "--input", | |
27 default = NA, | |
28 type = 'character', | |
29 help = "Input file that contains log2(CPM +1) values" | |
30 ), | |
31 make_option( | |
32 "--sep", | |
33 default = '\t', | |
34 type = 'character', | |
35 help = "File separator [default : '%default' ]" | |
36 ), | |
37 make_option( | |
38 "--colnames", | |
39 default = TRUE, | |
40 type = 'logical', | |
41 help = "Consider first line as header ? [default : '%default' ]" | |
42 ), | |
43 make_option( | |
44 "--genes", | |
45 default = NA, | |
46 type = 'character', | |
47 help = "List of genes comma separated" | |
48 ), | |
49 make_option( | |
50 "--percentile_threshold", | |
51 default = 20, | |
52 type = 'integer', | |
53 help = "detection threshold to keep a gene in signature set [default : '%default' ]" | |
54 ), | |
55 make_option( | |
56 "--output", | |
57 default = "./output.tab", | |
58 type = 'character', | |
59 help = "Output path [default : '%default' ]" | |
60 ), | |
61 make_option( | |
62 "--stats", | |
63 default = "./statistics.tab", | |
64 type = 'character', | |
65 help = "statistics path [default : '%default' ]" | |
66 ), | |
67 make_option( | |
68 "--pdf", | |
69 default = "~/output.pdf", | |
70 type = 'character', | |
71 help = "pdf path [default : '%default' ]" | |
72 ) | |
73 ) | |
74 | |
75 opt = parse_args(OptionParser(option_list = option_list), | |
76 args = commandArgs(trailingOnly = TRUE)) | |
77 | |
78 if (opt$sep == "tab") {opt$sep = "\t"} | |
79 if (opt$sep == "comma") {opt$sep = ","} | |
80 | |
81 # Take input data | |
82 data.counts <- read.table( | |
83 opt$input, | |
84 h = opt$colnames, | |
85 row.names = 1, | |
86 sep = opt$sep, | |
87 check.names = F | |
88 ) | |
89 | |
90 # Get vector of target genes | |
91 genes <- unlist(strsplit(opt$genes, ",")) | |
92 | |
93 if (length(unique(genes %in% rownames(data.counts))) == 1) { | |
94 if (unique(genes %in% rownames(data.counts)) == F) | |
95 stop("None of these genes are in your dataset: ", opt$genes) | |
96 } | |
97 | |
98 logical_genes <- rownames(data.counts) %in% genes | |
99 | |
100 # Retrieve target genes in counts data | |
101 signature.counts <- subset(data.counts, logical_genes) | |
102 | |
103 | |
104 ## Descriptive Statistics Function | |
105 descriptive_stats = function(InputData) { | |
106 SummaryData = data.frame( | |
107 mean = rowMeans(InputData), | |
108 SD = apply(InputData, 1, sd), | |
109 Variance = apply(InputData, 1, var), | |
110 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { | |
111 (sum(x != 0) / ncol(y)) * 100 | |
112 }) | |
113 ) | |
114 return(SummaryData) | |
115 } | |
116 | |
117 signature_stats <- descriptive_stats(signature.counts) | |
118 | |
119 # Find poorly detected genes from the signature | |
120 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold | |
121 | |
122 # Add warnings | |
123 if (length(unique(kept_genes)) > 1) { | |
124 cat( | |
125 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", | |
126 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), | |
127 "\n" | |
128 ) | |
129 } else { | |
130 if (unique(kept_genes) == F) { | |
131 stop( | |
132 "None of these genes are detected in ", | |
133 opt$percent, | |
134 "% of your cells: ", | |
135 paste(rownames(signature_stats), collapse = ", "), | |
136 ". You can be less stringent thanks to --percent parameter." | |
137 ) | |
138 } | |
139 } | |
140 | |
141 # Remove genes poorly detected in the dataset | |
142 signature.counts <- signature.counts[kept_genes,] | |
143 | |
144 # Replace 0 by 1 counts | |
145 signature.counts[signature.counts == 0] <- 1 | |
146 | |
147 # Geometric mean by cell | |
148 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych | |
149 | |
150 # Add results in signature_output | |
151 signature_output <- data.frame( | |
152 cell = names(score), | |
153 score = score, | |
154 rate = ifelse(score > mean(score), "HIGH", "LOW"), | |
155 nGenes = colSums(data.counts != 0), | |
156 total_counts = colSums(data.counts) | |
157 ) | |
158 | |
159 # statistics of input genes, signature genes first lines | |
160 statistics.counts <- rbind(subset(data.counts, logical_genes), | |
161 subset(data.counts, !logical_genes)) | |
162 statistics <- descriptive_stats(statistics.counts) | |
163 statistics <- cbind(gene=rownames(statistics), statistics) | |
164 | |
165 | |
166 | |
167 # Re-arrange score matrix for plots | |
168 score <- data.frame(score = score, | |
169 order = rank(score, ties.method = "first"), | |
170 signature = signature_output$rate, | |
171 stringsAsFactors = F) | |
172 | |
173 pdf(file = opt$pdf) | |
174 | |
175 ggplot(score, aes(x = order, y = score)) + | |
176 geom_line() + | |
177 geom_segment(x = 0, xend = max(score$order[score$signature == "LOW"]), y = mean(score$score), yend = mean(score$score)) + | |
178 geom_area(aes(fill = signature), alpha = .7) + | |
179 scale_fill_manual(values=c("#ff0000", "#08661e")) + | |
180 geom_text(aes(x = 1, y = mean(score)), label = "Mean", vjust = -0.3, colour = "black") + | |
181 labs(title = "Ordered cell signature scores", x = "Cell index", y = "Score") | |
182 | |
183 density_score <- density(score$score) | |
184 ggplot(data.frame(density_score[1:2]), aes(x, y, fill = ifelse(x < mean(score$score), "LOW", "HIGH"))) + | |
185 geom_line() + | |
186 geom_vline(xintercept = mean(score$score)) + | |
187 geom_text(x = mean(score$score), y = max(density_score$y), label = "Mean", hjust = -0.3, colour = "black") + | |
188 geom_area(alpha = .7) + | |
189 scale_fill_manual(values=c("#ff0000", "#08661e")) + | |
190 ylim(0, max(density_score$y)) + | |
191 labs( | |
192 title = "Distribution of Cell signature scores", | |
193 x = paste("N =", density_score$n, "Bandwidth =", density_score$bw), | |
194 y = "Density", | |
195 fill = "Signature" | |
196 ) | |
197 | |
198 # Check score independant of low expression | |
199 p_gene <- ggplot(signature_output, aes(rate, nGenes)) + | |
200 geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + | |
201 scale_fill_manual(values=c("#ff0000", "#08661e")) + | |
202 geom_jitter() + labs(y = "Number of detected genes", x = "Signature") | |
203 | |
204 p_counts <- ggplot(signature_output, aes(rate, total_counts)) + | |
205 geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) + | |
206 scale_fill_manual(values=c("#ff0000", "#08661e")) + | |
207 geom_jitter() + labs(y = "Total counts", x = "Signature") | |
208 | |
209 grid.arrange(p_gene, p_counts, ncol = 2, top = "Influence of library sequencing depth on cell signature scores") | |
210 | |
211 dev.off() | |
212 | |
213 # Save file | |
214 write.table( | |
215 signature_output, | |
216 opt$output, | |
217 sep = "\t", | |
218 quote = F, | |
219 col.names = T, | |
220 row.names = F | |
221 ) | |
222 | |
223 write.table( | |
224 statistics, | |
225 opt$stats, | |
226 sep = "\t", | |
227 quote = F, | |
228 col.names = T, | |
229 row.names = F | |
230 ) |