Repository 'music_construct_eset'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/music_construct_eset

Changeset 3:7ffaa0968da3 (2022-02-10)
Previous changeset 2:7902cd31b9b5 (2022-01-29) Next changeset 4:282819d09a4f (2022-05-02)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
modified:
macros.xml
scripts/estimateprops.R
added:
scripts/compare.R
test-data/default_output_no_disease_nnls.pdf
test-data/out_filt1.pdf
test-data/out_heat2.pdf
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 macros.xml
--- a/macros.xml Sat Jan 29 12:52:10 2022 +0000
+++ b/macros.xml Thu Feb 10 12:53:22 2022 +0000
b
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">2</token>
+    <token name="@VERSION_SUFFIX@">3</token>
     <!-- The ESet inspector/constructor and MuSiC tool can have
          independent Galaxy versions but should reference the same
          package version always. -->
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 scripts/compare.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/compare.R Thu Feb 10 12:53:22 2022 +0000
[
b'@@ -0,0 +1,440 @@\n+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))\n+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))\n+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))\n+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))\n+## We use this script to estimate the effectiveness of proportion methods\n+\n+## Load Conf\n+args <- commandArgs(trailingOnly = TRUE)\n+source(args[1])\n+\n+method_key <- list("MuSiC" = "est_music",\n+                   "NNLS" = "est_nnls")[[est_method]]\n+\n+\n+scale_yaxes <- function(gplot, value) {\n+    if (is.na(value)) {\n+        gplot\n+    } else {\n+        gplot + scale_y_continuous(lim = c(0, value))\n+    }\n+}\n+\n+\n+set_factor_data <- function(bulk_data, factor_name = NULL) {\n+    if (is.null(factor_name)) {\n+        factor_name <- "None" ## change to something plottable\n+    }\n+    pdat <- pData(bulk_data)\n+    sam_fact <- NULL\n+    if (factor_name %in% colnames(pdat)) {\n+        sam_fact <- cbind(rownames(pdat),\n+                          as.character(pdat[[factor_name]]))\n+        cat(paste0("   - factor: ", factor_name,\n+                   " found in phenotypes\\n"))\n+    } else {\n+        ## We assign this as the factor for the entire dataset\n+        sam_fact <- cbind(rownames(pdat),\n+                          factor_name)\n+        cat(paste0("   - factor: assigning \\"", factor_name,\n+                   "\\" to whole dataset\\n"))\n+    }\n+    colnames(sam_fact) <- c("Samples", "Factors")\n+    return(as.data.frame(sam_fact))\n+}\n+\n+## Due to limiting sizes, we need to load and unload\n+## possibly very large datasets.\n+process_pair <- function(sc_data, bulk_data,\n+                         ctypes_label, samples_label, ctypes,\n+                         factor_group) {\n+    ## - Generate\n+    est_prop <- music_prop(\n+        bulk.eset = bulk_data, sc.eset = sc_data,\n+        clusters = ctypes_label,\n+        samples = samples_label, select.ct = ctypes, verbose = T)\n+    ## -\n+    estimated_music_props <- est_prop$Est.prop.weighted\n+    estimated_nnls_props <- est_prop$Est.prop.allgene\n+    ## -\n+    fact_data <- set_factor_data(bulk_data, factor_group)\n+    ## -\n+    return(list(est_music = estimated_music_props,\n+                est_nnls = estimated_nnls_props,\n+                bulk_sample_totals = colSums(exprs(bulk_data)),\n+                plot_groups = fact_data))\n+}\n+\n+music_on_all <- function(files) {\n+    results <- list()\n+    for (sc_name in names(files)) {\n+        cat(paste0("sc-group:", sc_name, "\\n"))\n+        scgroup <- files[[sc_name]]\n+        ## - sc Data\n+        sc_est <- readRDS(scgroup$dataset)\n+        ## - params\n+        celltypes_label <- scgroup$label_cell\n+        samples_label <- scgroup$label_sample\n+        celltypes <- scgroup$celltype\n+\n+        results[[sc_name]] <- list()\n+        for (bulk_name in names(scgroup$bulk)) {\n+            cat(paste0(" - bulk-group:", bulk_name, "\\n"))\n+            bulkgroup <- scgroup$bulk[[bulk_name]]\n+            ## - bulk Data\n+            bulk_est <- readRDS(bulkgroup$dataset)\n+            ## - bulk params\n+            pheno_facts <- bulkgroup$pheno_facts\n+            pheno_excl <- bulkgroup$pheno_excl\n+            ##\n+            results[[sc_name]][[bulk_name]] <- process_pair(\n+                sc_est, bulk_est,\n+                celltypes_label, samples_label,\n+                celltypes, bulkgroup$factor_group)\n+            ##\n+            rm(bulk_est) ## unload\n+        }\n+        rm(sc_est) ## unload\n+    }\n+    return(results)\n+}\n+\n+plot_all_individual_heatmaps <- function(results) {\n+    pdf(out_heatmulti_pdf, width = 8, height = 8)\n+    for (sc_name in names(results)) {\n+        for (bk_name in names(results[[sc_name]])) {\n+            res <- results[[sc_name]][[bk_name]]\n+            plot_hmap <- Prop_heat_Est(\n+                data.matrix(res[[method_key]]), method.name = est_method) +\n+                ggtitle(paste0("[", est_method, "Cell type ",\n+                          '..b'\n+    B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +\n+        ylab("Bulk Dataset")\n+    ## -- Factor plots are optional\n+    A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()\n+\n+    if (do_factors) {\n+        A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))\n+        A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))\n+        B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +\n+            ylab("Bulk Dataset")\n+        B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +\n+            ylab("Bulk Dataset")\n+    }\n+\n+    title_a <- "Cell Types against Bulk"\n+    title_b <- "Bulk Datasets against Cells"\n+    if (do_factors) {\n+        title_a <- paste0(title_a, " and Factors")\n+        title_b <- paste0(title_b, " and Factors")\n+    }\n+\n+    a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),\n+                       plot_grid(plotlist = A, ncol = 2),\n+                       ncol = 1, rel_heights = c(0.05, 0.95))\n+    b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),\n+                       plot_grid(plotlist = B, ncol = 2),\n+                       ncol = 1, rel_heights = c(0.05, 0.95))\n+    return(list(cell = a_all, bulk = b_all))\n+}\n+\n+filter_output <- function(grudat_spread_melt, out_filt) {\n+    print_red <- function(comment, red_list) {\n+        cat(paste(comment, paste(red_list, collapse = ", "), "\\n"))\n+    }\n+    grudat_filt <- grudat_spread_melt\n+    print_red("Total Cell types:", unique(grudat_filt$Cell))\n+    if (!is.null(out_filt$cells)) {\n+        grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]\n+        print_red(" - selecting:", out_filt$cells)\n+    }\n+    print_red("Total Factors:", unique(grudat_spread_melt$Factors))\n+    if (!is.null(out_filt$facts)) {\n+        grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]\n+        print_red(" - selecting:", out_filt$facts)\n+    }\n+    return(grudat_filt)\n+}\n+\n+\n+results <- music_on_all(files)\n+\n+if (heat_grouped_p) {\n+    plot_grouped_heatmaps(results)\n+} else {\n+    plot_all_individual_heatmaps(results)\n+}\n+\n+save.image("/tmp/sesh.rds")\n+\n+summat <- summarized_matrix(results)\n+grudat <- group_by_dataset(summat)\n+grudat_spread_melt <- merge_factors_spread(grudat$spread,\n+                                           flatten_factor_list(results))\n+\n+\n+\n+## The output filters ONLY apply to boxplots, since these take\n+do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)\n+\n+grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)\n+\n+heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors)\n+box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)\n+\n+pdf(out_heatsumm_pdf, width = 14, height = 14)\n+print(heat_maps)\n+print(box_plots)\n+dev.off()\n+\n+## Generate output tables\n+stats_prop <- lapply(grudat$spread$prop, function(x) {\n+    t(apply(x, 1, summary))})\n+stats_scale <- lapply(grudat$spread$scale, function(x) {\n+    t(apply(x, 1, summary))})\n+\n+writable2 <- function(obj, prefix, title) {\n+    write.table(obj,\n+                file = paste0("report_data/", prefix, "_",\n+                              title, ".tabular"),\n+                quote = F, sep = "\\t", col.names = NA)\n+}\n+## Make the value table printable\n+grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint\n+colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",\n+                                  "CT Prop in Sample", "Number of Reads")\n+\n+writable2(grudat_spread_melt, "values", "Data Table")\n+writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")\n+writable2({\n+    aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa\n+}, "values", "Matrix of Cell Type Read Counts")\n+\n+for (bname in names(stats_prop)) {\n+    writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))\n+    writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))\n+}\n'
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 scripts/estimateprops.R
--- a/scripts/estimateprops.R Sat Jan 29 12:52:10 2022 +0000
+++ b/scripts/estimateprops.R Thu Feb 10 12:53:22 2022 +0000
[
b'@@ -14,8 +14,12 @@\n     clusters = celltypes_label,\n     samples = samples_label, select.ct = celltypes, verbose = T)\n \n+\n estimated_music_props <- est_prop$Est.prop.weighted\n estimated_nnls_props <- est_prop$Est.prop.allgene\n+##\n+estimated_music_props_flat <- melt(estimated_music_props)\n+estimated_nnls_props_flat <- melt(estimated_nnls_props)\n \n scale_yaxes <- function(gplot, value) {\n     if (is.na(value)) {\n@@ -25,23 +29,36 @@\n     }\n }\n \n+sieve_data <- function(func, music_data, nnls_data) {\n+    if (func == "list") {\n+        res <- list(if ("MuSiC" %in% methods) music_data else NULL,\n+                    if ("NNLS" %in% methods) nnls_data else NULL)\n+        res[lengths(res) > 0] ## filter out NULL elements\n+    } else if (func == "rbind") {\n+        rbind(if ("MuSiC" %in% methods) music_data else NULL,\n+              if ("NNLS" %in% methods) nnls_data else NULL)\n+    } else if (func == "c") {\n+        c(if ("MuSiC" %in% methods) music_data else NULL,\n+          if ("NNLS" %in% methods) nnls_data else NULL)\n+    }\n+}\n+\n+\n ## Show different in estimation methods\n ## Jitter plot of estimated cell type proportions\n jitter_fig <- scale_yaxes(Jitter_Est(\n-    list(data.matrix(estimated_music_props),\n-         data.matrix(estimated_nnls_props)),\n+    sieve_data("list",\n+               data.matrix(estimated_music_props),\n+               data.matrix(estimated_nnls_props)),\n     method.name = methods, title = "Jitter plot of Est Proportions",\n     size = 2, alpha = 0.7) + theme_minimal(), maxyscale)\n \n-\n ## Make a Plot\n ## A more sophisticated jitter plot is provided as below. We separated\n ## the T2D subjects and normal subjects by their disease factor levels.\n-estimated_music_props_flat <- melt(estimated_music_props)\n-estimated_nnls_props_flat <- melt(estimated_nnls_props)\n-\n-m_prop <- rbind(estimated_music_props_flat,\n-                estimated_nnls_props_flat)\n+m_prop <- sieve_data("rbind",\n+                     estimated_music_props_flat,\n+                     estimated_nnls_props_flat)\n colnames(m_prop) <- c("Sub", "CellType", "Prop")\n \n if (is.null(celltypes)) {\n@@ -69,7 +86,7 @@\n         phenotype_target_threshold <- -Inf\n         message("phenotype target threshold set to -Inf")\n     }\n-\n+    ## the "2" here is to do with the sample groups, not number of methods\n     m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint\n     m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]\n     ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2\n@@ -84,8 +101,10 @@\n     m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),\n                     subset(m_prop, Disease == sample_disease_group))\n \n-    jitter_new <- scale_yaxes(ggplot(m_prop, aes(Method, Prop)) +\n-        geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n+    jitter_new <- scale_yaxes(\n+        ggplot(m_prop, aes(Method, Prop)) +\n+        geom_point(aes(fill = Method, color = Disease,\n+                       stroke = D, shape = Disease),\n                    size = 2, alpha = 0.7,\n                    position = position_jitter(width = 0.25, height = 0)) +\n         facet_wrap(~ CellType, scales = "free") +\n@@ -100,21 +119,23 @@\n     ## Create dataframe for beta cell proportions and Disease_factor levels\n     ## - Ugly code. Essentially, doubles the cell type proportions for each\n     ##   set of MuSiC and NNLS methods\n-    m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint\n-                                              phenotype_factors],\n-                             ## get proportions of target cell type\n-                             ct.prop = c(estimated_music_props[, phenotype_scrna_target],\n-                                         estimated_nnls_props[, phenotype_scrna_target]),\n-                             ##\n-                             Method = factor(rep(methods,\n-                                                 each = nrow(estimated_music_'..b'       geom_smooth(method = "lm",  se = FALSE, col = "black", lwd = 0.25) +\n-        geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n+        geom_point(aes(fill = Method, color = Disease,\n+                       stroke = D, shape = Disease),\n                    size = 2, alpha = 0.7) +  facet_wrap(~ Method) +\n         ggtitle(paste0(toupper(phenotype_target), " vs. ",\n-                       toupper(phenotype_scrna_target), " Cell Type Proportion")) +\n+                       toupper(phenotype_scrna_target),\n+                       " Cell Type Proportion")) +\n         theme_minimal() +\n         ylab(paste0("Proportion of ",\n                     phenotype_scrna_target, " cells")) +\n@@ -138,19 +162,22 @@\n }\n \n ## BoxPlot\n-plot_box <- scale_yaxes(Boxplot_Est(list(\n-    data.matrix(estimated_music_props),\n-    data.matrix(estimated_nnls_props)),\n-    method.name = c("MuSiC", "NNLS")) +\n+plot_box <- scale_yaxes(Boxplot_Est(\n+    sieve_data("list",\n+               data.matrix(estimated_music_props),\n+               data.matrix(estimated_nnls_props)),\n+    method.name = methods) +\n     theme(axis.text.x = element_text(angle = -90),\n           axis.text.y = element_text(size = 8)) +\n     ggtitle(element_blank()) + theme_minimal(), maxyscale)\n \n ## Heatmap\n-plot_hmap <- Prop_heat_Est(list(\n-    data.matrix(estimated_music_props),\n-    data.matrix(estimated_nnls_props)),\n-    method.name = c("MuSiC", "NNLS")) +\n+plot_hmap <- Prop_heat_Est(\n+    sieve_data(\n+        "list",\n+        data.matrix(estimated_music_props),\n+        data.matrix(estimated_nnls_props)),\n+    method.name = methods) +\n     theme(axis.text.x = element_text(angle = -90),\n           axis.text.y = element_text(size = 6))\n \n@@ -167,33 +194,29 @@\n plot_hmap\n message(dev.off())\n \n-## Output Proportions\n+writable <- function(obj, prefix, title) {\n+    write.table(obj,\n+                file = paste0("report_data/", prefix, "_",\n+                              title, ".tabular"),\n+                quote = F, sep = "\\t", col.names = NA)\n+}\n \n-write.table(est_prop$Est.prop.weighted,\n-            file = paste0("report_data/prop_",\n-                          "Music Estimated Proportions of Cell Types",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Est.prop.allgene,\n-            file = paste0("report_data/prop_",\n-                          "NNLS Estimated Proportions of Cell Types",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Weight.gene,\n-            file = paste0("report_data/weightgene_",\n-                          "Music Estimated Proportions of Cell Types (by Gene)",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$r.squared.full,\n-            file = paste0("report_data/rsquared_",\n-                          "Music R-sqr Estimated Proportions of Each Subject",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Var.prop,\n-            file = paste0("report_data/varprop_",\n-                          "Matrix of Variance of MuSiC Estimates",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n+## Output Proportions\n+if ("NNLS" %in% methods) {\n+    writable(est_prop$Est.prop.allgene, "prop",\n+             "NNLS Estimated Proportions of Cell Types")\n+}\n+\n+if ("MuSiC" %in% methods) {\n+    writable(est_prop$Est.prop.weighted, "prop",\n+             "Music Estimated Proportions of Cell Types")\n+    writable(est_prop$Weight.gene, "weightgene",\n+             "Music Estimated Proportions of Cell Types (by Gene)")\n+    writable(est_prop$r.squared.full, "rsquared",\n+             "Music R-sqr Estimated Proportions of Each Subject")\n+    writable(est_prop$Var.prop, "varprop",\n+             "Matrix of Variance of MuSiC Estimates")\n+}\n \n \n if (use_disease_factor) {\n'
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 test-data/default_output_no_disease_nnls.pdf
b
Binary file test-data/default_output_no_disease_nnls.pdf has changed
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 test-data/out_filt1.pdf
b
Binary file test-data/out_filt1.pdf has changed
b
diff -r 7902cd31b9b5 -r 7ffaa0968da3 test-data/out_heat2.pdf
b
Binary file test-data/out_heat2.pdf has changed