comparison signature_score.R @ 0:baf7a079bce0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_signature_score commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:40:08 -0400
parents
children e08419b8ec24
comparison
equal deleted inserted replaced
-1:000000000000 0:baf7a079bce0
1 #########################
2 # Signature score #
3 #########################
4
5 # Compute the signature score based on the geometric mean of the target gene expression
6 # and split cells in 2 groups (high/low) using this signature score.
7
8 # Example of command
9 # Rscript 4-signature_score.R --input <input.tsv> --genes ARNT2,SALL2,SOX9,OLIG2,POU3F2
10 # --output <output.tab> --pdf <output.pdf>
11
12 # load packages that are provided in the conda env
13 options( show.error.messages=F,
14 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
16 warnings()
17
18 library(optparse)
19 library(psych)
20 library(ggplot2)
21 library(gridExtra)
22
23 # Arguments
24 option_list = list(
25 make_option(
26 "--input",
27 default = NA,
28 type = 'character',
29 help = "Input file that contains log2(CPM +1) values"
30 ),
31 make_option(
32 "--sep",
33 default = '\t',
34 type = 'character',
35 help = "File separator [default : '%default' ]"
36 ),
37 make_option(
38 "--colnames",
39 default = TRUE,
40 type = 'logical',
41 help = "Consider first line as header ? [default : '%default' ]"
42 ),
43 make_option(
44 "--genes",
45 default = NA,
46 type = 'character',
47 help = "List of genes comma separated"
48 ),
49 make_option(
50 "--percentile_threshold",
51 default = 20,
52 type = 'integer',
53 help = "detection threshold to keep a gene in signature set [default : '%default' ]"
54 ),
55 make_option(
56 "--output",
57 default = "./output.tab",
58 type = 'character',
59 help = "Output path [default : '%default' ]"
60 ),
61 make_option(
62 "--stats",
63 default = "./statistics.tab",
64 type = 'character',
65 help = "statistics path [default : '%default' ]"
66 ),
67 make_option(
68 "--pdf",
69 default = "~/output.pdf",
70 type = 'character',
71 help = "pdf path [default : '%default' ]"
72 )
73 )
74
75 opt = parse_args(OptionParser(option_list = option_list),
76 args = commandArgs(trailingOnly = TRUE))
77
78 if (opt$sep == "tab") {opt$sep = "\t"}
79 if (opt$sep == "comma") {opt$sep = ","}
80
81 # Take input data
82 data.counts <- read.table(
83 opt$input,
84 h = opt$colnames,
85 row.names = 1,
86 sep = opt$sep,
87 check.names = F
88 )
89
90 # Get vector of target genes
91 genes <- unlist(strsplit(opt$genes, ","))
92
93 if (length(unique(genes %in% rownames(data.counts))) == 1) {
94 if (unique(genes %in% rownames(data.counts)) == F)
95 stop("None of these genes are in your dataset: ", opt$genes)
96 }
97
98 logical_genes <- rownames(data.counts) %in% genes
99
100 # Retrieve target genes in counts data
101 signature.counts <- subset(data.counts, logical_genes)
102
103
104 ## Descriptive Statistics Function
105 descriptive_stats = function(InputData) {
106 SummaryData = data.frame(
107 mean = rowMeans(InputData),
108 SD = apply(InputData, 1, sd),
109 Variance = apply(InputData, 1, var),
110 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
111 (sum(x != 0) / ncol(y)) * 100
112 })
113 )
114 return(SummaryData)
115 }
116
117 signature_stats <- descriptive_stats(signature.counts)
118
119 # Find poorly detected genes from the signature
120 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
121
122 # Add warnings
123 if (length(unique(kept_genes)) > 1) {
124 cat(
125 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :",
126 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "),
127 "\n"
128 )
129 } else {
130 if (unique(kept_genes) == F) {
131 stop(
132 "None of these genes are detected in ",
133 opt$percent,
134 "% of your cells: ",
135 paste(rownames(signature_stats), collapse = ", "),
136 ". You can be less stringent thanks to --percent parameter."
137 )
138 }
139 }
140
141 # Remove genes poorly detected in the dataset
142 signature.counts <- signature.counts[kept_genes,]
143
144 # Replace 0 by 1 counts
145 signature.counts[signature.counts == 0] <- 1
146
147 # Geometric mean by cell
148 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych
149
150 # Add results in signature_output
151 signature_output <- data.frame(
152 cell = names(score),
153 score = score,
154 rate = ifelse(score > mean(score), "HIGH", "LOW"),
155 nGenes = colSums(data.counts != 0),
156 total_counts = colSums(data.counts)
157 )
158
159 # statistics of input genes, signature genes first lines
160 statistics.counts <- rbind(subset(data.counts, logical_genes),
161 subset(data.counts, !logical_genes))
162 statistics <- descriptive_stats(statistics.counts)
163 statistics <- cbind(gene=rownames(statistics), statistics)
164
165
166
167 # Re-arrange score matrix for plots
168 score <- data.frame(score = score,
169 order = rank(score, ties.method = "first"),
170 signature = signature_output$rate,
171 stringsAsFactors = F)
172
173 pdf(file = opt$pdf)
174
175 ggplot(score, aes(x = order, y = score)) +
176 geom_line() +
177 geom_segment(x = 0, xend = max(score$order[score$signature == "LOW"]), y = mean(score$score), yend = mean(score$score)) +
178 geom_area(aes(fill = signature), alpha = .7) +
179 scale_fill_manual(values=c("#ff0000", "#08661e")) +
180 geom_text(aes(x = 1, y = mean(score)), label = "Mean", vjust = -0.3, colour = "black") +
181 labs(title = "Ordered cell signature scores", x = "Cell index", y = "Score")
182
183 density_score <- density(score$score)
184 ggplot(data.frame(density_score[1:2]), aes(x, y, fill = ifelse(x < mean(score$score), "LOW", "HIGH"))) +
185 geom_line() +
186 geom_vline(xintercept = mean(score$score)) +
187 geom_text(x = mean(score$score), y = max(density_score$y), label = "Mean", hjust = -0.3, colour = "black") +
188 geom_area(alpha = .7) +
189 scale_fill_manual(values=c("#ff0000", "#08661e")) +
190 ylim(0, max(density_score$y)) +
191 labs(
192 title = "Distribution of Cell signature scores",
193 x = paste("N =", density_score$n, "Bandwidth =", density_score$bw),
194 y = "Density",
195 fill = "Signature"
196 )
197
198 # Check score independant of low expression
199 p_gene <- ggplot(signature_output, aes(rate, nGenes)) +
200 geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) +
201 scale_fill_manual(values=c("#ff0000", "#08661e")) +
202 geom_jitter() + labs(y = "Number of detected genes", x = "Signature")
203
204 p_counts <- ggplot(signature_output, aes(rate, total_counts)) +
205 geom_violin(aes(fill = rate), alpha = .5, trim = F, show.legend = F) +
206 scale_fill_manual(values=c("#ff0000", "#08661e")) +
207 geom_jitter() + labs(y = "Total counts", x = "Signature")
208
209 grid.arrange(p_gene, p_counts, ncol = 2, top = "Influence of library sequencing depth on cell signature scores")
210
211 dev.off()
212
213 # Save file
214 write.table(
215 signature_output,
216 opt$output,
217 sep = "\t",
218 quote = F,
219 col.names = T,
220 row.names = F
221 )
222
223 write.table(
224 statistics,
225 opt$stats,
226 sep = "\t",
227 quote = F,
228 col.names = T,
229 row.names = F
230 )