Mercurial > repos > bgruening > music_construct_eset
comparison scripts/compare.R @ 3:7ffaa0968da3 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
| author | bgruening |
|---|---|
| date | Thu, 10 Feb 2022 12:53:22 +0000 |
| parents | |
| children | 282819d09a4f |
comparison
equal
deleted
inserted
replaced
| 2:7902cd31b9b5 | 3:7ffaa0968da3 |
|---|---|
| 1 suppressWarnings(suppressPackageStartupMessages(library(xbioc))) | |
| 2 suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) | |
| 3 suppressWarnings(suppressPackageStartupMessages(library(reshape2))) | |
| 4 suppressWarnings(suppressPackageStartupMessages(library(cowplot))) | |
| 5 ## We use this script to estimate the effectiveness of proportion methods | |
| 6 | |
| 7 ## Load Conf | |
| 8 args <- commandArgs(trailingOnly = TRUE) | |
| 9 source(args[1]) | |
| 10 | |
| 11 method_key <- list("MuSiC" = "est_music", | |
| 12 "NNLS" = "est_nnls")[[est_method]] | |
| 13 | |
| 14 | |
| 15 scale_yaxes <- function(gplot, value) { | |
| 16 if (is.na(value)) { | |
| 17 gplot | |
| 18 } else { | |
| 19 gplot + scale_y_continuous(lim = c(0, value)) | |
| 20 } | |
| 21 } | |
| 22 | |
| 23 | |
| 24 set_factor_data <- function(bulk_data, factor_name = NULL) { | |
| 25 if (is.null(factor_name)) { | |
| 26 factor_name <- "None" ## change to something plottable | |
| 27 } | |
| 28 pdat <- pData(bulk_data) | |
| 29 sam_fact <- NULL | |
| 30 if (factor_name %in% colnames(pdat)) { | |
| 31 sam_fact <- cbind(rownames(pdat), | |
| 32 as.character(pdat[[factor_name]])) | |
| 33 cat(paste0(" - factor: ", factor_name, | |
| 34 " found in phenotypes\n")) | |
| 35 } else { | |
| 36 ## We assign this as the factor for the entire dataset | |
| 37 sam_fact <- cbind(rownames(pdat), | |
| 38 factor_name) | |
| 39 cat(paste0(" - factor: assigning \"", factor_name, | |
| 40 "\" to whole dataset\n")) | |
| 41 } | |
| 42 colnames(sam_fact) <- c("Samples", "Factors") | |
| 43 return(as.data.frame(sam_fact)) | |
| 44 } | |
| 45 | |
| 46 ## Due to limiting sizes, we need to load and unload | |
| 47 ## possibly very large datasets. | |
| 48 process_pair <- function(sc_data, bulk_data, | |
| 49 ctypes_label, samples_label, ctypes, | |
| 50 factor_group) { | |
| 51 ## - Generate | |
| 52 est_prop <- music_prop( | |
| 53 bulk.eset = bulk_data, sc.eset = sc_data, | |
| 54 clusters = ctypes_label, | |
| 55 samples = samples_label, select.ct = ctypes, verbose = T) | |
| 56 ## - | |
| 57 estimated_music_props <- est_prop$Est.prop.weighted | |
| 58 estimated_nnls_props <- est_prop$Est.prop.allgene | |
| 59 ## - | |
| 60 fact_data <- set_factor_data(bulk_data, factor_group) | |
| 61 ## - | |
| 62 return(list(est_music = estimated_music_props, | |
| 63 est_nnls = estimated_nnls_props, | |
| 64 bulk_sample_totals = colSums(exprs(bulk_data)), | |
| 65 plot_groups = fact_data)) | |
| 66 } | |
| 67 | |
| 68 music_on_all <- function(files) { | |
| 69 results <- list() | |
| 70 for (sc_name in names(files)) { | |
| 71 cat(paste0("sc-group:", sc_name, "\n")) | |
| 72 scgroup <- files[[sc_name]] | |
| 73 ## - sc Data | |
| 74 sc_est <- readRDS(scgroup$dataset) | |
| 75 ## - params | |
| 76 celltypes_label <- scgroup$label_cell | |
| 77 samples_label <- scgroup$label_sample | |
| 78 celltypes <- scgroup$celltype | |
| 79 | |
| 80 results[[sc_name]] <- list() | |
| 81 for (bulk_name in names(scgroup$bulk)) { | |
| 82 cat(paste0(" - bulk-group:", bulk_name, "\n")) | |
| 83 bulkgroup <- scgroup$bulk[[bulk_name]] | |
| 84 ## - bulk Data | |
| 85 bulk_est <- readRDS(bulkgroup$dataset) | |
| 86 ## - bulk params | |
| 87 pheno_facts <- bulkgroup$pheno_facts | |
| 88 pheno_excl <- bulkgroup$pheno_excl | |
| 89 ## | |
| 90 results[[sc_name]][[bulk_name]] <- process_pair( | |
| 91 sc_est, bulk_est, | |
| 92 celltypes_label, samples_label, | |
| 93 celltypes, bulkgroup$factor_group) | |
| 94 ## | |
| 95 rm(bulk_est) ## unload | |
| 96 } | |
| 97 rm(sc_est) ## unload | |
| 98 } | |
| 99 return(results) | |
| 100 } | |
| 101 | |
| 102 plot_all_individual_heatmaps <- function(results) { | |
| 103 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
| 104 for (sc_name in names(results)) { | |
| 105 for (bk_name in names(results[[sc_name]])) { | |
| 106 res <- results[[sc_name]][[bk_name]] | |
| 107 plot_hmap <- Prop_heat_Est( | |
| 108 data.matrix(res[[method_key]]), method.name = est_method) + | |
| 109 ggtitle(paste0("[", est_method, "Cell type ", | |
| 110 "proportions in ", | |
| 111 bk_name, " (Bulk) based on ", | |
| 112 sc_name, " (scRNA)")) + | |
| 113 xlab("Cell Types (scRNA)") + | |
| 114 ylab("Samples (Bulk)") + | |
| 115 theme(axis.text.x = element_text(angle = -90), | |
| 116 axis.text.y = element_text(size = 6)) | |
| 117 print(plot_hmap) | |
| 118 } | |
| 119 } | |
| 120 dev.off() | |
| 121 } | |
| 122 | |
| 123 merge_factors_spread <- function(grudat_spread, factor_groups) { | |
| 124 ## Generated | |
| 125 merge_it <- function(matr, plot_groups, valname) { | |
| 126 ren <- melt(lapply(matr, function(mat) { | |
| 127 mat["ct"] <- rownames(mat); return(mat)})) | |
| 128 ## - Grab factors and merge into list | |
| 129 ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples") | |
| 130 colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors") | |
| 131 return(ren_new) | |
| 132 } | |
| 133 tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"), | |
| 134 merge_it(grudat$spread$scale, factor_groups, "value.scale"), | |
| 135 by = c("Sample", "Cell", "Bulk", "Factors")) | |
| 136 return(tab) | |
| 137 } | |
| 138 | |
| 139 | |
| 140 plot_grouped_heatmaps <- function(results) { | |
| 141 pdf(out_heatmulti_pdf, width = 8, height = 8) | |
| 142 for (sc_name in names(results)) { | |
| 143 named_list <- sapply( | |
| 144 names(results[[sc_name]]), | |
| 145 function(n) { | |
| 146 ## We transpose the data here, because | |
| 147 ## the plotting function omits by default | |
| 148 ## the Y-axis which are the samples. | |
| 149 ## Since the celltypes are the common factor | |
| 150 ## these should be the Y-axis instead. | |
| 151 t(data.matrix(results[[sc_name]][[n]][[method_key]])) | |
| 152 }, simplify = F, USE.NAMES = T) | |
| 153 named_methods <- names(results[[sc_name]]) | |
| 154 ## | |
| 155 plot_hmap <- Prop_heat_Est( | |
| 156 named_list, | |
| 157 method.name = named_methods) + | |
| 158 ggtitle(paste0("[", est_method, "] Cell type ", | |
| 159 "proportions of ", | |
| 160 "Bulk Datasets based on ", | |
| 161 sc_name, " (scRNA)")) + | |
| 162 xlab("Samples (Bulk)") + | |
| 163 ylab("Cell Types (scRNA)") + | |
| 164 theme(axis.text.x = element_text(angle = -90), | |
| 165 axis.text.y = element_text(size = 6)) | |
| 166 print(plot_hmap) | |
| 167 } | |
| 168 dev.off() | |
| 169 } | |
| 170 | |
| 171 ## Desired plots | |
| 172 ## 1. Pie chart: | |
| 173 ## - Per Bulk dataset (using just normalised proportions) | |
| 174 ## - Per Bulk dataset (multiplying proportions by nreads) | |
| 175 | |
| 176 unlist_names <- function(results, method, prepend_bkname=FALSE) { | |
| 177 unique(sort( | |
| 178 unlist(lapply(names(results), function(scname) { | |
| 179 lapply(names(results[[scname]]), function(bkname) { | |
| 180 res <- get(method)(results[[scname]][[bkname]][[method_key]]) | |
| 181 if (prepend_bkname) { | |
| 182 ## We *do not* assume unique bulk sample names | |
| 183 ## across different bulk datasets. | |
| 184 res <- paste0(bkname, "::", res) | |
| 185 } | |
| 186 return(res) | |
| 187 }) | |
| 188 })) | |
| 189 )) | |
| 190 } | |
| 191 | |
| 192 summarized_matrix <- function(results) { # nolint | |
| 193 ## We assume that cell types MUST be unique, but that sample | |
| 194 ## names do not need to be. For this reason, we must prepend | |
| 195 ## the bulk dataset name to the individual sample names. | |
| 196 all_celltypes <- unlist_names(results, "colnames") | |
| 197 all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE) | |
| 198 | |
| 199 ## Iterate through all possible samples and populate a table. | |
| 200 ddff <- data.frame() | |
| 201 ddff_scale <- data.frame() | |
| 202 for (cell in all_celltypes) { | |
| 203 for (sample in all_samples) { | |
| 204 group_sname <- unlist(strsplit(sample, split = "::")) | |
| 205 bulk <- group_sname[1] | |
| 206 id_sample <- group_sname[2] | |
| 207 for (scgroup in names(results)) { | |
| 208 if (bulk %in% names(results[[scgroup]])) { | |
| 209 mat_prop <- results[[scgroup]][[bulk]][[method_key]] | |
| 210 vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals | |
| 211 ## - We use sample instead of id_sample because we need to | |
| 212 ## extract bulk sets from the complete matrix later. It's | |
| 213 ## messy, yes. | |
| 214 if (cell %in% colnames(mat_prop)) { | |
| 215 ddff[cell, sample] <- mat_prop[id_sample, cell] | |
| 216 ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint | |
| 217 } else { | |
| 218 ddff[cell, sample] <- 0 | |
| 219 ddff_scale[cell, sample] <- 0 | |
| 220 } | |
| 221 } | |
| 222 } | |
| 223 } | |
| 224 } | |
| 225 return(list(prop = ddff, scaled = ddff_scale)) | |
| 226 } | |
| 227 | |
| 228 flatten_factor_list <- function(results) { | |
| 229 ## Get a 2d DF of all factors across all bulk samples. | |
| 230 res <- c() | |
| 231 for (scgroup in names(results)) { | |
| 232 for (bulkgroup in names(results[[scgroup]])) { | |
| 233 dat <- results[[scgroup]][[bulkgroup]]$plot_groups | |
| 234 dat$Samples <- paste0(bulkgroup, "::", dat$Samples) #nolint | |
| 235 res <- rbind(res, dat) | |
| 236 } | |
| 237 } | |
| 238 return(res) | |
| 239 } | |
| 240 | |
| 241 group_by_dataset <- function(summat) { | |
| 242 bulk_names <- unlist( | |
| 243 lapply(names(files), function(x) names(files[[x]]$bulk))) | |
| 244 mat_names <- colnames(summat$prop) | |
| 245 bd <- list() | |
| 246 bd_scale <- list() | |
| 247 bd_spread_scale <- list() | |
| 248 bd_spread_prop <- list() | |
| 249 for (bname in bulk_names) { | |
| 250 subs <- mat_names[startsWith(mat_names, paste0(bname, "::"))] | |
| 251 ## - | |
| 252 bd[[bname]] <- rowSums(summat$prop[, subs]) | |
| 253 bd_scale[[bname]] <- rowSums(summat$scaled[, subs]) | |
| 254 bd_spread_scale[[bname]] <- summat$scaled[, subs] | |
| 255 bd_spread_prop[[bname]] <- summat$prop[, subs] | |
| 256 } | |
| 257 return(list(prop = as.data.frame(bd), | |
| 258 scaled = as.data.frame(bd_scale), | |
| 259 spread = list(scale = bd_spread_scale, | |
| 260 prop = bd_spread_prop))) | |
| 261 } | |
| 262 | |
| 263 summarize_heatmaps <- function(grudat_spread_melt, do_factors) { | |
| 264 ## - | |
| 265 do_single <- function(grudat_melted, yaxis, xaxis, fillval, title, | |
| 266 ylabs = element_blank(), xlabs = element_blank(), | |
| 267 use_log = TRUE, size = 11) { | |
| 268 ## Convert from matrix to long format | |
| 269 melted <- grudat_melted ## copy? | |
| 270 if (use_log) { | |
| 271 melted[[fillval]] <- log10(melted[[fillval]] + 1) | |
| 272 } | |
| 273 return(ggplot(melted) + | |
| 274 geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval), | |
| 275 colour = "white") + | |
| 276 scale_fill_gradient2(low = "steelblue", high = "red", | |
| 277 mid = "white", name = element_blank()) + | |
| 278 theme(axis.text.x = element_text(angle = -90, hjust = 0, | |
| 279 size = size)) + | |
| 280 ggtitle(label = title) + xlab(xlabs) + ylab(ylabs)) | |
| 281 } | |
| 282 | |
| 283 do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) { | |
| 284 do_logged <- (plot %in% c("log", "both")) | |
| 285 do_normal <- (plot %in% c("normal", "both")) | |
| 286 plist <- list() | |
| 287 if (do_logged) { | |
| 288 plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
| 289 "value.scale", "Reads (log10+1)", | |
| 290 size = size) | |
| 291 plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
| 292 "value.prop", "Sample (log10+1)", | |
| 293 size = size) | |
| 294 } | |
| 295 if (do_normal) { | |
| 296 plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
| 297 "value.scale", "Reads", use_log = F, | |
| 298 size = size) | |
| 299 plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar, | |
| 300 "value.prop", "Sample", use_log = F, | |
| 301 size = size) | |
| 302 } | |
| 303 return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"), | |
| 304 plot_grid(plotlist = plist, ncol = ncol), | |
| 305 ncol = 1, rel_heights = c(0.05, 0.95))) | |
| 306 | |
| 307 } | |
| 308 p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both", ) | |
| 309 p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal", 1, | |
| 310 size = 8) | |
| 311 p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log", 1, | |
| 312 size = 8) | |
| 313 p3 <- ggplot + theme_void() | |
| 314 if (do_factors) { | |
| 315 p3 <- do_gridplot("Cell Types against Factors", "Factors", "both") | |
| 316 } | |
| 317 return(list(bulk = p1, | |
| 318 samples = list(log = p2b, normal = p2a), | |
| 319 factors = p3)) | |
| 320 } | |
| 321 | |
| 322 summarize_boxplots <- function(grudat_spread, do_factors) { | |
| 323 common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") + | |
| 324 xlab(element_blank()) + ylab(element_blank()) | |
| 325 common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") + | |
| 326 xlab(element_blank()) + ylab(element_blank()) | |
| 327 | |
| 328 A <- B <- list() #nolint | |
| 329 ## Cell type by sample | |
| 330 A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
| 331 A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk)) | |
| 332 ## Sample by Cell type | |
| 333 B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
| 334 ylab("Bulk Dataset") | |
| 335 B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) + | |
| 336 ylab("Bulk Dataset") | |
| 337 ## -- Factor plots are optional | |
| 338 A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void() | |
| 339 | |
| 340 if (do_factors) { | |
| 341 A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors)) | |
| 342 A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors)) | |
| 343 B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
| 344 ylab("Bulk Dataset") | |
| 345 B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) + | |
| 346 ylab("Bulk Dataset") | |
| 347 } | |
| 348 | |
| 349 title_a <- "Cell Types against Bulk" | |
| 350 title_b <- "Bulk Datasets against Cells" | |
| 351 if (do_factors) { | |
| 352 title_a <- paste0(title_a, " and Factors") | |
| 353 title_b <- paste0(title_b, " and Factors") | |
| 354 } | |
| 355 | |
| 356 a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"), | |
| 357 plot_grid(plotlist = A, ncol = 2), | |
| 358 ncol = 1, rel_heights = c(0.05, 0.95)) | |
| 359 b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"), | |
| 360 plot_grid(plotlist = B, ncol = 2), | |
| 361 ncol = 1, rel_heights = c(0.05, 0.95)) | |
| 362 return(list(cell = a_all, bulk = b_all)) | |
| 363 } | |
| 364 | |
| 365 filter_output <- function(grudat_spread_melt, out_filt) { | |
| 366 print_red <- function(comment, red_list) { | |
| 367 cat(paste(comment, paste(red_list, collapse = ", "), "\n")) | |
| 368 } | |
| 369 grudat_filt <- grudat_spread_melt | |
| 370 print_red("Total Cell types:", unique(grudat_filt$Cell)) | |
| 371 if (!is.null(out_filt$cells)) { | |
| 372 grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ] | |
| 373 print_red(" - selecting:", out_filt$cells) | |
| 374 } | |
| 375 print_red("Total Factors:", unique(grudat_spread_melt$Factors)) | |
| 376 if (!is.null(out_filt$facts)) { | |
| 377 grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ] | |
| 378 print_red(" - selecting:", out_filt$facts) | |
| 379 } | |
| 380 return(grudat_filt) | |
| 381 } | |
| 382 | |
| 383 | |
| 384 results <- music_on_all(files) | |
| 385 | |
| 386 if (heat_grouped_p) { | |
| 387 plot_grouped_heatmaps(results) | |
| 388 } else { | |
| 389 plot_all_individual_heatmaps(results) | |
| 390 } | |
| 391 | |
| 392 save.image("/tmp/sesh.rds") | |
| 393 | |
| 394 summat <- summarized_matrix(results) | |
| 395 grudat <- group_by_dataset(summat) | |
| 396 grudat_spread_melt <- merge_factors_spread(grudat$spread, | |
| 397 flatten_factor_list(results)) | |
| 398 | |
| 399 | |
| 400 | |
| 401 ## The output filters ONLY apply to boxplots, since these take | |
| 402 do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1) | |
| 403 | |
| 404 grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt) | |
| 405 | |
| 406 heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors) | |
| 407 box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors) | |
| 408 | |
| 409 pdf(out_heatsumm_pdf, width = 14, height = 14) | |
| 410 print(heat_maps) | |
| 411 print(box_plots) | |
| 412 dev.off() | |
| 413 | |
| 414 ## Generate output tables | |
| 415 stats_prop <- lapply(grudat$spread$prop, function(x) { | |
| 416 t(apply(x, 1, summary))}) | |
| 417 stats_scale <- lapply(grudat$spread$scale, function(x) { | |
| 418 t(apply(x, 1, summary))}) | |
| 419 | |
| 420 writable2 <- function(obj, prefix, title) { | |
| 421 write.table(obj, | |
| 422 file = paste0("report_data/", prefix, "_", | |
| 423 title, ".tabular"), | |
| 424 quote = F, sep = "\t", col.names = NA) | |
| 425 } | |
| 426 ## Make the value table printable | |
| 427 grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint | |
| 428 colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors", | |
| 429 "CT Prop in Sample", "Number of Reads") | |
| 430 | |
| 431 writable2(grudat_spread_melt, "values", "Data Table") | |
| 432 writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions") | |
| 433 writable2({ | |
| 434 aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa | |
| 435 }, "values", "Matrix of Cell Type Read Counts") | |
| 436 | |
| 437 for (bname in names(stats_prop)) { | |
| 438 writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props")) | |
| 439 writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props")) | |
| 440 } |
