view scripts/compare.R @ 3:1e4db841f0db draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 6954ddebecfe9f0975f2c7e29193e4c99c550514
author bgruening
date Mon, 28 Oct 2024 17:49:31 +0000
parents 4447ed460308
children
line wrap: on
line source

suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
## We use this script to estimate the effectiveness of proportion methods

## Load Conf
args <- commandArgs(trailingOnly = TRUE)
source(args[1])

method_key <- list("MuSiC" = "est_music",
                   "NNLS" = "est_nnls")[[est_method]]

delim <- "::" ## separator bulk datasets and their samples

scale_yaxes <- function(gplot, value) {
    if (is.na(value)) {
        gplot
    } else {
        gplot + scale_y_continuous(lim = c(0, value))
    }
}


set_factor_data <- function(bulk_data, factor_name = NULL) {
    if (is.null(factor_name)) {
        factor_name <- "None" ## change to something plottable
    }
    pdat <- pData(bulk_data)
    sam_fact <- NULL
    if (factor_name %in% colnames(pdat)) {
        sam_fact <- cbind(rownames(pdat),
                          as.character(pdat[[factor_name]]))
        cat(paste0("   - factor: ", factor_name,
                   " found in phenotypes\n"))
    } else {
        ## We assign this as the factor for the entire dataset
        sam_fact <- cbind(rownames(pdat),
                          factor_name)
        cat(paste0("   - factor: assigning \"", factor_name,
                   "\" to whole dataset\n"))
    }
    colnames(sam_fact) <- c("Samples", "Factors")
    return(as.data.frame(sam_fact))
}

## Due to limiting sizes, we need to load and unload
## possibly very large datasets.
process_pair <- function(sc_data, bulk_data,
                         ctypes_label, samples_label, ctypes,
                         factor_group) {
    ## - Generate
    est_prop <- music_prop(
        bulk.eset = bulk_data, sc.eset = sc_data,
        clusters = ctypes_label,
        samples = samples_label, select.ct = ctypes, verbose = T)
    ## -
    estimated_music_props <- est_prop$Est.prop.weighted
    estimated_nnls_props <- est_prop$Est.prop.allgene
    ## -
    fact_data <- set_factor_data(bulk_data, factor_group)
    ## -
    return(list(est_music = estimated_music_props,
                est_nnls = estimated_nnls_props,
                bulk_sample_totals = colSums(exprs(bulk_data)),
                plot_groups = fact_data))
}

music_on_all <- function(files) {
    results <- list()
    for (sc_name in names(files)) {
        cat(paste0("sc-group:", sc_name, "\n"))
        scgroup <- files[[sc_name]]
        ## - sc Data
        sc_est <- readRDS(scgroup$dataset)
        ## - params
        celltypes_label <- scgroup$label_cell
        samples_label <- scgroup$label_sample
        celltypes <- scgroup$celltype

        results[[sc_name]] <- list()
        for (bulk_name in names(scgroup$bulk)) {
            cat(paste0(" - bulk-group:", bulk_name, "\n"))
            bulkgroup <- scgroup$bulk[[bulk_name]]
            ## - bulk Data
            bulk_est <- readRDS(bulkgroup$dataset)
            ## - bulk params
            pheno_facts <- bulkgroup$pheno_facts
            pheno_excl <- bulkgroup$pheno_excl
            ##
            results[[sc_name]][[bulk_name]] <- process_pair(
                sc_est, bulk_est,
                celltypes_label, samples_label,
                celltypes, bulkgroup$factor_group)
            ##
            rm(bulk_est) ## unload
        }
        rm(sc_est) ## unload
    }
    return(results)
}

plot_all_individual_heatmaps <- function(results) {
    pdf(out_heatmulti_pdf, width = 8, height = 8)
    for (sc_name in names(results)) {
        for (bk_name in names(results[[sc_name]])) {
            res <- results[[sc_name]][[bk_name]]
            plot_hmap <- Prop_heat_Est(
                data.matrix(res[[method_key]]), method.name = est_method) +
                ggtitle(paste0("[", est_method, "Cell type ",
                               "proportions in ",
                               bk_name, " (Bulk) based on ",
                               sc_name, " (scRNA)")) +
                xlab("Cell Types (scRNA)") +
                ylab("Samples (Bulk)") +
                theme(axis.text.x = element_text(angle = -90),
                      axis.text.y = element_text(size = 6))
            print(plot_hmap)
        }
    }
    dev.off()
}

merge_factors_spread <- function(grudat_spread, factor_groups) {
    ## Generated
    merge_it <- function(matr, plot_groups, valname) {
        ren <- melt(lapply(matr, function(mat) {
            mat["ct"] <- rownames(mat); return(mat)}))
        ## - Grab factors and merge into list
        ren_new <- merge(ren, plot_groups, by.x = "variable", by.y = "Samples")
        colnames(ren_new) <- c("Sample", "Cell", valname, "Bulk", "Factors")
        return(ren_new)
    }
    tab <- merge(merge_it(grudat$spread$prop, factor_groups, "value.prop"),
                 merge_it(grudat$spread$scale, factor_groups, "value.scale"),
                 by = c("Sample", "Cell", "Bulk", "Factors"))
    return(tab)
}

unlist_names <- function(results, method, prepend_bkname=FALSE) {
    unique(sort(
        unlist(lapply(names(results), function(scname) {
            lapply(names(results[[scname]]), function(bkname) {
                res <- get(method)(results[[scname]][[bkname]][[method_key]])
                if (prepend_bkname) {
                    ## We *do not* assume unique bulk sample names
                    ## across different bulk datasets.
                    res <- paste0(bkname, delim, res)
                }
                return(res)
            })
        }))
    ))
}

summarized_matrix <- function(results) {  # nolint
    ## We assume that cell types MUST be unique, but that sample
    ## names do not need to be. For this reason, we must prepend
    ## the bulk dataset name to the individual sample names.
    all_celltypes <- unlist_names(results, "colnames")
    all_samples <- unlist_names(results, "rownames", prepend_bkname = TRUE)

    ## Iterate through all possible samples and populate a table.
    ddff <- data.frame()
    ddff_scale <- data.frame()
    for (cell in all_celltypes) {
        for (sample in all_samples) {
            group_sname <- unlist(strsplit(sample, split = delim))
            bulk <- group_sname[1]
            id_sample <- group_sname[2]
            for (scgroup in names(results)) {
                if (bulk %in% names(results[[scgroup]])) {
                    mat_prop <- results[[scgroup]][[bulk]][[method_key]]
                    vec_counts <- results[[scgroup]][[bulk]]$bulk_sample_totals
                    ## - We use sample instead of id_sample because we need to
                    ##   extract bulk sets from the complete matrix later. It's
                    ##   messy, yes.
                    if (cell %in% colnames(mat_prop)) {
                        ddff[cell, sample] <- mat_prop[id_sample, cell]
                        ddff_scale[cell, sample] <- mat_prop[id_sample, cell] * vec_counts[[id_sample]] #nolint
                    } else {
                        ddff[cell, sample] <- 0
                        ddff_scale[cell, sample] <- 0
                    }
                }
            }
        }
    }
    return(list(prop = ddff, scaled = ddff_scale))
}

flatten_factor_list <- function(results) {
    ## Get a 2d DF of all factors across all bulk samples.
    res <- c()
    for (scgroup in names(results)) {
        for (bulkgroup in names(results[[scgroup]])) {
            dat <- results[[scgroup]][[bulkgroup]]$plot_groups
            dat$Samples <- paste0(bulkgroup, delim, dat$Samples) #nolint
            res <- rbind(res, dat)
        }
    }
    return(res)
}

group_by_dataset <- function(summat) {
    bulk_names <- unlist(
        lapply(names(files), function(x) names(files[[x]]$bulk)))
    mat_names <- colnames(summat$prop)
    bd <- list()
    bd_scale <- list()
    bd_spread_scale <- list()
    bd_spread_prop <- list()
    for (bname in bulk_names) {
        subs <- mat_names[startsWith(mat_names, paste0(bname, delim))]
        ## -
        bd[[bname]] <- rowSums(summat$prop[, subs])
        bd_scale[[bname]] <- rowSums(summat$scaled[, subs])
        bd_spread_scale[[bname]] <- summat$scaled[, subs]
        bd_spread_prop[[bname]] <- summat$prop[, subs]
    }
    return(list(prop = as.data.frame(bd),
                scaled = as.data.frame(bd_scale),
                spread = list(scale = bd_spread_scale,
                              prop = bd_spread_prop)))
}

do_cluster <- function(grudat_spread_melt, xaxis, yaxis, value_name,
                       xlabs="", ylabs="", titled="",
                       order_col=T, order_row=T, size=11) {

    data_m <- grudat_spread_melt
    data_matrix <- {
        tmp <- dcast(data_m, formula(paste0(yaxis, " ~ ", xaxis)), value.var = value_name)
        rownames(tmp) <- tmp[[yaxis]]
        tmp[[yaxis]] <- NULL
        tmp
    }
    dist_method <- "euclidean"
    clust_method <- "complete"

    if (order_row) {
        dd_row <- as.dendrogram(hclust(dist(data_matrix, method = dist_method), method = clust_method))
        row_ord <- order.dendrogram(dd_row)
        ordered_row_names <- row.names(data_matrix[row_ord, ])
        data_m[[yaxis]] <- factor(data_m[[yaxis]], levels = ordered_row_names)
    }

    if (order_col) {
        dd_col <- as.dendrogram(hclust(dist(t(data_matrix), method = dist_method),
                                       method = clust_method))
        col_ord <- order.dendrogram(dd_col)
        ordered_col_names <- colnames(data_matrix[, col_ord])
        data_m[[xaxis]] <- factor(data_m[[xaxis]], levels = ordered_col_names)
    }

    heat_plot <- ggplot(data_m, aes_string(x = xaxis, y = yaxis, fill = value_name)) +
        geom_tile(colour = "white") +
        scale_fill_gradient2(low = "steelblue", high = "red", mid = "white",
                             name = element_blank()) +
        scale_y_discrete(position = "right") +
        theme(axis.text.x = element_text(angle = -90, hjust = 0,
                                         size = size)) +
        ggtitle(label = titled) + xlab(xlabs) + ylab(ylabs)

    ## Graphics
    dendro_linesize <- 0.5
    dendro_colunit <- 0.2
    dendro_rowunit <- 0.1
    final_plot <- heat_plot

    if (order_row) {
        dendro_data_row <- ggdendro::dendro_data(dd_row, type = "rectangle")
        dendro_row <- cowplot::axis_canvas(heat_plot, axis = "y", coord_flip = TRUE) +
            ggplot2::geom_segment(data = ggdendro::segment(dendro_data_row),
                                  ggplot2::aes(y = -y, x = x, xend = xend, yend = -yend),
                                  size = dendro_linesize) + ggplot2::coord_flip()
        final_plot <- cowplot::insert_yaxis_grob(
                                   final_plot, dendro_row, grid::unit(dendro_colunit, "null"),
                                   position = "left")
    }
    if (order_col) {
        dendro_data_col <- ggdendro::dendro_data(dd_col, type = "rectangle")
        dendro_col <- cowplot::axis_canvas(heat_plot, axis = "x") +
            ggplot2::geom_segment(data = ggdendro::segment(dendro_data_col),
                                  ggplot2::aes(x = x, y = y, xend = xend, yend = yend),
                                  size = dendro_linesize)
        final_plot <- cowplot::insert_xaxis_grob(
                                   final_plot, dendro_col, grid::unit(dendro_rowunit, "null"),
                                   position = "top")
    }
    return(cowplot::ggdraw(final_plot))
}

summarize_heatmaps <- function(grudat_spread_melt, do_factors, cluster="None") {
    ## - Cluster is either "Rows", "Cols", "Both", or "None"
    do_single <- function(grudat_melted, yaxis, xaxis, fillval, title,
                          ylabs = element_blank(), xlabs = element_blank(),
                          use_log = TRUE, size = 11) {
        ## Convert from matrix to long format
        melted <- grudat_melted ## copy?
        if (use_log) {
            melted[[fillval]] <- log10(melted[[fillval]] + 1)
        }
        if (cluster == "None") {
            return(ggplot(melted) +
                   geom_tile(aes_string(y = yaxis, x = xaxis, fill = fillval),
                             colour = "white") +
                   scale_fill_gradient2(
                       low = "steelblue", high = "red", mid = "white",
                       name = element_blank()) +
                   theme(axis.text.x = element_text(
                             angle = -90, hjust = 0, size = size)) +
                   ggtitle(label = title) + xlab(xlabs) + ylab(ylabs))
        } else {
            return(do_cluster(grudat_spread_melt, xaxis, yaxis, fillval,
                              xlabs, ylabs, title,
                              (cluster %in% c("Cols", "Both")),
                              (cluster %in% c("Rows", "Both"))))
        }
    }

    do_gridplot <- function(title, xvar, plot="both", ncol=2, size = 11) {
        do_logged <- (plot %in% c("log", "both"))
        do_normal <- (plot %in% c("normal", "both"))
        plist <- list()
        if (do_logged) {
            plist[["1"]] <- do_single(grudat_spread_melt, "Cell", xvar,
                                      "value.scale", "Reads (log10+1)",
                                      size = size)
            plist[["2"]] <- do_single(grudat_spread_melt, "Cell", xvar,
                                      "value.prop", "Sample (log10+1)",
                                      size = size)
        }
        if (do_normal) {
            plist[["A"]] <- do_single(grudat_spread_melt, "Cell", xvar,
                                      "value.scale", "Reads", use_log = F,
                                      size = size)
            plist[["B"]] <- do_single(grudat_spread_melt, "Cell", xvar,
                                      "value.prop", "Sample", use_log = F,
                                      size = size)
        }
        return(plot_grid(ggdraw() + draw_label(title, fontface = "bold"),
                         plot_grid(plotlist = plist, ncol = ncol),
                         ncol = 1, rel_heights = c(0.05, 0.95)))
    }

    p1 <- do_gridplot("Cell Types vs Bulk Datasets", "Bulk", "both")
    p2a <- do_gridplot("Cell Types vs Samples", "Sample", "normal",
                       ncol = 1, size = 8)
    p2b <- do_gridplot("Cell Types vs Samples (log10+1)", "Sample", "log",
                       ncol = 1, size = 8)
    p3 <- ggplot + theme_void()
    if (do_factors) {
        p3 <- do_gridplot("Cell Types vs Factors", "Factors", "both")
    }
    return(list(bulk = p1,
                samples = list(log = p2b, normal = p2a),
                factors = p3))
}

summarize_boxplots <- function(grudat_spread, do_factors) {
    common1 <- ggplot(grudat_spread, aes(x = value.prop)) + ggtitle("Sample") +
        xlab(element_blank()) + ylab(element_blank())
    common2 <- ggplot(grudat_spread, aes(x = value.scale)) + ggtitle("Reads") +
        xlab(element_blank()) + ylab(element_blank())

    A <- B <- list() #nolint
    ## Cell type by sample
    A$p1 <- common2 + geom_boxplot(aes(y = Cell, color = Bulk))
    A$p2 <- common1 + geom_boxplot(aes(y = Cell, color = Bulk))
    ## Sample by Cell type
    B$p1 <- common2 + geom_boxplot(aes(y = Bulk, color = Cell)) +
        ylab("Bulk Dataset")
    B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +
        ylab("Bulk Dataset")
    ## -- Factor plots are optional
    A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()

    if (do_factors) {
        A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))
        A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))
        B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +
            ylab("Bulk Dataset")
        B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +
            ylab("Bulk Dataset")
    }

    title_a <- "Cell Types vs Bulk Datasets"
    title_b <- "Bulk Datasets vs Cell Types"
    if (do_factors) {
        title_a <- paste0(title_a, " and Factors")
        title_b <- paste0(title_b, " and Factors")
    }

    a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),
                       plot_grid(plotlist = A, ncol = 2),
                       ncol = 1, rel_heights = c(0.05, 0.95))
    b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),
                       plot_grid(plotlist = B, ncol = 2),
                       ncol = 1, rel_heights = c(0.05, 0.95))
    return(list(cell = a_all, bulk = b_all))
}

filter_output <- function(grudat_spread_melt, out_filt) {
    print_red <- function(comment, red_list) {
        cat(paste(comment, paste(red_list, collapse = ", "), "\n"))
    }
    grudat_filt <- grudat_spread_melt
    print_red("Total Cell types:", unique(grudat_filt$Cell))
    if (!is.null(out_filt$cells)) {
        grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]
        print_red(" - selecting:", out_filt$cells)
    }
    print_red("Total Factors:", unique(grudat_spread_melt$Factors))
    if (!is.null(out_filt$facts)) {
        grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]
        print_red(" - selecting:", out_filt$facts)
    }
    return(grudat_filt)
}

writable2 <- function(obj, prefix, title) {
    write.table(obj,
                file = paste0("report_data/", prefix, "_",
                              title, ".tabular"),
                quote = F, sep = "\t", col.names = NA)
}


results <- music_on_all(files)
summat <- summarized_matrix(results)
grudat <- group_by_dataset(summat)
grudat_spread_melt <- merge_factors_spread(grudat$spread,
                                           flatten_factor_list(results))
grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)

plot_all_individual_heatmaps(results)

## The output filters ONLY apply to boxplots, since these take
do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)
box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)
heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors,
                                dendro_setting)

pdf(out_heatsumm_pdf, width = 14, height = 14)
print(heat_maps)
print(box_plots)
dev.off()

## Generate output tables
stats_prop <- lapply(grudat$spread$prop, function(x) {
    t(apply(x, 1, summary))})
stats_scale <- lapply(grudat$spread$scale, function(x) {
    t(apply(x, 1, summary))})

## Make the value table printable
grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint
colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",
                                  "CT Prop in Sample", "Number of Reads")

writable2(grudat_spread_melt, "values", "Data Table")
writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")
writable2({
    aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa
}, "values", "Matrix of Cell Type Read Counts")

for (bname in names(stats_prop)) {
    writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))
    writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))
}