Mercurial > repos > bgruening > music_construct_eset
comparison scripts/compare.R @ 3:7ffaa0968da3 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
author | bgruening |
---|---|
date | Thu, 10 Feb 2022 12:53:22 +0000 |
parents | |
children | 282819d09a4f |
comparison
equal
deleted
inserted
replaced
2:7902cd31b9b5 | 3:7ffaa0968da3 |
---|---|
1 suppressWarnings(suppressPackageStartupMessages(library(xbioc))) | |
2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) | |
3 suppressWarnings(suppressPackageStartupMessages(library(reshape2))) | |
4 suppressWarnings(suppressPackageStartupMessages(library(cowplot))) | |
5 ## We use this script to estimate the effectiveness of proportion methods | |
6 | |
7 ## Load Conf | |
8 args <- commandArgs(trailingOnly = TRUE) | |
9 source(args[1]) | |
10 | |
11 method_key <- list("MuSiC" = "est_music", | |
12 "NNLS" = "est_nnls")[[est_method]] | |
13 | |
14 | |
15 scale_yaxes <- function(gplot, value) { | |
16 if (is.na(value)) { | |
17 gplot | |
18 } else { | |
19 gplot + scale_y_continuous(lim = c(0, value)) | |
20 } | |
21 } | |
22 | |
23 | |
24 set_factor_data <- function(bulk_data, factor_name = NULL) { | |
25 if (is.null(factor_name)) { | |
26 factor_name <- "None" ## change to something plottable | |
27 } | |
28 pdat <- pData(bulk_data) | |
29 sam_fact <- NULL | |
30 if (factor_name %in% colnames(pdat)) { | |
31 sam_fact <- cbind(rownames(pdat), | |
32 as.character(pdat[[factor_name]])) | |
33 cat(paste0(" - factor: ", factor_name, | |
34 " found in phenotypes\n")) | |
35 } else { | |
36 ## We assign this as the factor for the entire dataset | |
37 sam_fact <- cbind(rownames(pdat), | |
38 factor_name) | |
39 cat(paste0(" - factor: assigning \"", factor_name, | |
40 "\" to whole dataset\n")) | |
41 } | |
42 colnames(sam_fact) <- c("Samples", "Factors") | |
43 return(as.data.frame(sam_fact)) | |
44 } | |
45 | |
46 ## Due to limiting sizes, we need to load and unload | |
47 ## possibly very large datasets. | |
48 process_pair <- function(sc_data, bulk_data, | |
49 ctypes_label, samples_label, ctypes, | |
50 factor_group) { | |
51 ## - Generate | |
52 est_prop <- music_prop( | |
53 bulk.eset = bulk_data, sc.eset = sc_data, | |
54 clusters = ctypes_label, | |
55 samples = samples_label, select.ct = ctypes, verbose = T) | |
56 ## - | |
57 estimated_music_props <- est_prop$Est.prop.weighted | |
58 estimated_nnls_props <- est_prop$Est.prop.allgene | |
59 ## - | |
60 fact_data <- set_factor_data(bulk_data, factor_group) | |
61 ## - | |
62 return(list(est_music = estimated_music_props, | |
63 est_nnls = estimated_nnls_props, | |
64 bulk_sample_totals = colSums(exprs(bulk_data)), | |
65 plot_groups = fact_data)) | |
66 } | |
67 | |
68 music_on_all <- function(files) { | |
69 results <- list() | |
70 for (sc_name in names(files)) { | |
71 cat(paste0("sc-group:", sc_name, "\n")) | |
72 scgroup <- files[[sc_name]] | |
73 ## - sc Data | |
74 sc_est <- readRDS(scgroup$dataset) | |
75 ## - params | |
76 celltypes_label <- scgroup$label_cell | |
77 samples_label <- scgroup$label_sample | |
78 celltypes <- scgroup$celltype | |
79 | |
80 results[[sc_name]] <- list() | |
81 for (bulk_name in names(scgroup$bulk)) { | |
82 cat(paste0(" - bulk-group:", bulk_name, "\n")) | |
83 bulkgroup <- scgroup$bulk[[bulk_name]] | |
84 ## - bulk Data | |
85 bulk_est <- readRDS(bulkgroup$dataset) | |
86 ## - bulk params | |
87 pheno_facts <- bulkgroup$pheno_facts | |
88 pheno_excl <- bulkgroup$pheno_excl | |
89 ## | |
90 results[[sc_name]][[bulk_name]] <- process_pair( | |
91 sc_est, bulk_est, | |
92 celltypes_label, samples_label, | |
93 celltypes, bulkgroup$factor_group) | |
94 ## | |
95 rm(bulk_est) ## unload | |
96 } | |
97 rm(sc_est) ## unload | |
98 } | |
99 return(results) | |
100 } | |
101 | |
102 plot_all_individual_heatmaps <- function(results) { | |
103 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
104 for (sc_name in names(results)) { | |
105 for (bk_name in names(results[[sc_name]])) { | |
106 res <- results[[sc_name]][[bk_name]] | |
107 plot_hmap <- Prop_heat_Est( | |
108 data.matrix(res[[method_key]]), method.name = est_method) + | |
109 ggtitle(paste0("[", est_method, "Cell type ", | |
110 "proportions in ", | |
111 bk_name, " (Bulk) based on ", | |
112 sc_name, " (scRNA)")) + | |
113 xlab("Cell Types (scRNA)") + | |
114 ylab("Samples (Bulk)") + | |
115 theme(axis.text.x = element_text(angle = -90), | |
116 axis.text.y = element_text(size = 6)) | |
117 print(plot_hmap) | |
118 } | |
119 } | |
120 dev.off() | |
121 } | |
122 | |
123 merge_factors_spread <- function(grudat_spread, factor_groups) { | |
124 ## Generated | |
125 merge_it <- function(matr, plot_groups, valname) { | |
126 ren <- melt(lapply(matr, function(mat) { | |
127 mat["ct"] <- rownames(mat); return(mat)})) | |
128 ## - Grab factors and merge into list | |
129 ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") | |
130 colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") | |
131 return(ren_new) | |
132 } | |
133 tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), | |
134 merge_it(grudat$spread$scale, factor_groups, "value.scale"), | |
135 by = c("Sample", "Cell", "Bulk", "Factors")) | |
136 return(tab) | |
137 } | |
138 | |
139 | |
140 plot_grouped_heatmaps <- function(results) { | |
141 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
142 for (sc_name in names(results)) { | |
143 named_list <- sapply( | |
144 names(results[[sc_name]]), | |
145 function(n) { | |
146 ## We transpose the data here, because | |
147 ## the plotting function omits by default | |
148 ## the Y-axis which are the samples. | |
149 ## Since the celltypes are the common factor | |
150 ## these should be the Y-axis instead. | |
151 t(data.matrix(results[[sc_name]][[n]][[method_key]])) | |
152 }, simplify = F, USE.NAMES = T) | |
153 named_methods <- names(results[[sc_name]]) | |
154 ## | |
155 plot_hmap <- Prop_heat_Est( | |
156 named_list, | |
157 method.name = named_methods) + | |
158 ggtitle(paste0("[", est_method, "] Cell type ", | |
159 "proportions of ", | |
160 "Bulk Datasets based on ", | |
161 sc_name, " (scRNA)")) + | |
162 xlab("Samples (Bulk)") + | |
163 ylab("Cell Types (scRNA)") + | |
164 theme(axis.text.x = element_text(angle = -90), | |
165 axis.text.y = element_text(size = 6)) | |
166 print(plot_hmap) | |
167 } | |
168 dev.off() | |
169 } | |
170 | |
171 ## Desired plots | |
172 ## 1. Pie chart: | |
173 ## - Per Bulk dataset (using just normalised proportions) | |
174 ## - Per Bulk dataset (multiplying proportions by nreads) | |
175 | |
176 unlist_names <- function(results, method, prepend_bkname=FALSE) { | |
177 unique(sort( | |
178 unlist(lapply(names(results), function(scname) { | |
179 lapply(names(results[[scname]]), function(bkname) { | |
180 res <- get(method)(results[[scname]][[bkname]][[method_key]]) | |
181 if (prepend_bkname) { | |
182 ## We *do not* assume unique bulk sample names | |
183 ## across different bulk datasets. | |
184 res <- paste0(bkname, "::", res) | |
185 } | |
186 return(res) | |
187 }) | |
188 })) | |
189 )) | |
190 } | |
191 | |
192 summarized_matrix <- function(results) { # nolint | |
193 ## We assume that cell types MUST be unique, but that sample | |
194 ## names do not need to be. For this reason, we must prepend | |
195 ## the bulk dataset name to the individual sample names. | |
196 all_celltypes <- unlist_names(results, "colnames") | |
197 all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) | |
198 | |
199 ## Iterate through all possible samples and populate a table. | |
200 ddff <- data.frame() | |
201 ddff_scale <- data.frame() | |
202 for (cell in all_celltypes) { | |
203 for (sample in all_samples) { | |
204 group_sname <- unlist(strsplit(sample, split = "::")) | |
205 bulk <- group_sname[1] | |
206 id_sample <- group_sname[2] | |
207 for (scgroup in names(results)) { | |
208 if (bulk %in% names(results[[scgroup]])) { | |
209 mat_prop <- results[[scgroup]][[bulk]][[method_key]] | |
210 vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals | |
211 ## - We use sample instead of id_sample because we need to | |
212 ## extract bulk sets from the complete matrix later. It's | |
213 ## messy, yes. | |
214 if (cell %in% colnames(mat_prop)) { | |
215 ddff[cell, sample] <- mat_prop[id_sample, cell] | |
216 ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint | |
217 } else { | |
218 ddff[cell, sample] <- 0 | |
219 ddff_scale[cell, sample] <- 0 | |
220 } | |
221 } | |
222 } | |
223 } | |
224 } | |
225 return(list(prop = ddff, scaled = ddff_scale)) | |
226 } | |
227 | |
228 flatten_factor_list <- function(results) { | |
229 ## Get a 2d DF of all factors across all bulk samples. | |
230 res <- c() | |
231 for (scgroup in names(results)) { | |
232 for (bulkgroup in names(results[[scgroup]])) { | |
233 dat <- results[[scgroup]][[bulkgroup]]$plot_groups | |
234 dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint | |
235 res <- rbind(res, dat) | |
236 } | |
237 } | |
238 return(res) | |
239 } | |
240 | |
241 group_by_dataset <- function(summat) { | |
242 bulk_names <- unlist( | |
243 lapply(names(files), function(x) names(files[[x]]$bulk))) | |
244 mat_names <- colnames(summat$prop) | |
245 bd <- list() | |
246 bd_scale <- list() | |
247 bd_spread_scale <- list() | |
248 bd_spread_prop <- list() | |
249 for (bname in bulk_names) { | |
250 subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] | |
251 ## - | |
252 bd[[bname]] <- rowSums(summat$prop[, subs]) | |
253 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) | |
254 bd_spread_scale[[bname]] <- summat$scaled[, subs] | |
255 bd_spread_prop[[bname]] <- summat$prop[, subs] | |
256 } | |
257 return(list(prop = as.data.frame(bd), | |
258 scaled = as.data.frame(bd_scale), | |
259 spread = list(scale = bd_spread_scale, | |
260 prop = bd_spread_prop))) | |
261 } | |
262 | |
263 summarize_heatmaps <- function(grudat_spread_melt, do_factors) { | |
264 ## - | |
265 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, | |
266 ylabs = element_blank(), xlabs = element_blank(), | |
267 use_log = TRUE, size = 11) { | |
268 ## Convert from matrix to long format | |
269 melted <- grudat_melted ## copy? | |
270 if (use_log) { | |
271 melted[[fillval]] <- log10(melted[[fillval]] + 1) | |
272 } | |
273 return(ggplot(melted) + | |
274 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), | |
275 colour = "white") + | |
276 scale_fill_gradient2(low = "steelblue", high = "red", | |
277 mid = "white", name = element_blank()) + | |
278 theme(axis.text.x = element_text(angle = -90, hjust = 0, | |
279 size = size)) + | |
280 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) | |
281 } | |
282 | |
283 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { | |
284 do_logged <- (plot %in% c("log", "both")) | |
285 do_normal <- (plot %in% c("normal", "both")) | |
286 plist <- list() | |
287 if (do_logged) { | |
288 plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
289 "value.scale", "Reads (log10+1)", | |
290 size = size) | |
291 plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
292 "value.prop", "Sample (log10+1)", | |
293 size = size) | |
294 } | |
295 if (do_normal) { | |
296 plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
297 "value.scale", "Reads", use_log = F, | |
298 size = size) | |
299 plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
300 "value.prop", "Sample", use_log = F, | |
301 size = size) | |
302 } | |
303 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), | |
304 plot_grid(plotlist = plist, ncol = ncol), | |
305 ncol = 1, rel_heights = c(0.05, 0.95))) | |
306 | |
307 } | |
308 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) | |
309 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, | |
310 size = 8) | |
311 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, | |
312 size = 8) | |
313 p3 <- ggplot + theme_void() | |
314 if (do_factors) { | |
315 p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") | |
316 } | |
317 return(list(bulk = p1, | |
318 samples = list(log = p2b, normal = p2a), | |
319 factors = p3)) | |
320 } | |
321 | |
322 summarize_boxplots <- function(grudat_spread, do_factors) { | |
323 common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + | |
324 xlab(element_blank()) + ylab(element_blank()) | |
325 common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + | |
326 xlab(element_blank()) + ylab(element_blank()) | |
327 | |
328 A <- B <- list() #nolint | |
329 ## Cell type by sample | |
330 A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
331 A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
332 ## Sample by Cell type | |
333 B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
334 ylab("Bulk Dataset") | |
335 B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
336 ylab("Bulk Dataset") | |
337 ## -- Factor plots are optional | |
338 A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() | |
339 | |
340 if (do_factors) { | |
341 A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) | |
342 A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) | |
343 B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
344 ylab("Bulk Dataset") | |
345 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
346 ylab("Bulk Dataset") | |
347 } | |
348 | |
349 title_a <- "Cell Types against Bulk" | |
350 title_b <- "Bulk Datasets against Cells" | |
351 if (do_factors) { | |
352 title_a <- paste0(title_a, " and Factors") | |
353 title_b <- paste0(title_b, " and Factors") | |
354 } | |
355 | |
356 a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), | |
357 plot_grid(plotlist = A, ncol = 2), | |
358 ncol = 1, rel_heights = c(0.05, 0.95)) | |
359 b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), | |
360 plot_grid(plotlist = B, ncol = 2), | |
361 ncol = 1, rel_heights = c(0.05, 0.95)) | |
362 return(list(cell = a_all, bulk = b_all)) | |
363 } | |
364 | |
365 filter_output <- function(grudat_spread_melt, out_filt) { | |
366 print_red <- function(comment, red_list) { | |
367 cat(paste(comment, paste(red_list, collapse = ", "), "\n")) | |
368 } | |
369 grudat_filt <- grudat_spread_melt | |
370 print_red("Total Cell types:", unique(grudat_filt$Cell)) | |
371 if (!is.null(out_filt$cells)) { | |
372 grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] | |
373 print_red(" - selecting:", out_filt$cells) | |
374 } | |
375 print_red("Total Factors:", unique(grudat_spread_melt$Factors)) | |
376 if (!is.null(out_filt$facts)) { | |
377 grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] | |
378 print_red(" - selecting:", out_filt$facts) | |
379 } | |
380 return(grudat_filt) | |
381 } | |
382 | |
383 | |
384 results <- music_on_all(files) | |
385 | |
386 if (heat_grouped_p) { | |
387 plot_grouped_heatmaps(results) | |
388 } else { | |
389 plot_all_individual_heatmaps(results) | |
390 } | |
391 | |
392 save.image("/tmp/sesh.rds") | |
393 | |
394 summat <- summarized_matrix(results) | |
395 grudat <- group_by_dataset(summat) | |
396 grudat_spread_melt <- merge_factors_spread(grudat$spread, | |
397 flatten_factor_list(results)) | |
398 | |
399 | |
400 | |
401 ## The output filters ONLY apply to boxplots, since these take | |
402 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) | |
403 | |
404 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) | |
405 | |
406 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) | |
407 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) | |
408 | |
409 pdf(out_heatsumm_pdf, width = 14, height = 14) | |
410 print(heat_maps) | |
411 print(box_plots) | |
412 dev.off() | |
413 | |
414 ## Generate output tables | |
415 stats_prop <- lapply(grudat$spread$prop, function(x) { | |
416 t(apply(x, 1, summary))}) | |
417 stats_scale <- lapply(grudat$spread$scale, function(x) { | |
418 t(apply(x, 1, summary))}) | |
419 | |
420 writable2 <- function(obj, prefix, title) { | |
421 write.table(obj, | |
422 file = paste0("report_data/", prefix, "_", | |
423 title, ".tabular"), | |
424 quote = F, sep = "\t", col.names = NA) | |
425 } | |
426 ## Make the value table printable | |
427 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint | |
428 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", | |
429 "CT Prop in Sample", "Number of Reads") | |
430 | |
431 writable2(grudat_spread_melt, "values", "Data Table") | |
432 writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") | |
433 writable2({ | |
434 aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa | |
435 }, "values", "Matrix of Cell Type Read Counts") | |
436 | |
437 for (bname in names(stats_prop)) { | |
438 writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) | |
439 writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) | |
440 } |