Repository 'music_deconvolution'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/music_deconvolution

Changeset 4:56371b5a2da9 (2022-02-10)
Previous changeset 3:fd7a16d073c5 (2022-01-29) Next changeset 5:2ba99a52bd44 (2022-05-02)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 8beed1a19fcd9dc59f7746e1dfa735a2d5f29784"
modified:
macros.xml
scripts/estimateprops.R
added:
music_deconvolution.xml
scripts/compare.R
test-data/default_output_no_disease_nnls.pdf
test-data/out_filt1.pdf
test-data/out_heat2.pdf
removed:
music-deconvolution.xml
b
diff -r fd7a16d073c5 -r 56371b5a2da9 macros.xml
--- a/macros.xml Sat Jan 29 12:51:49 2022 +0000
+++ b/macros.xml Thu Feb 10 12:52:31 2022 +0000
b
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">2</token>
+    <token name="@VERSION_SUFFIX@">3</token>
     <!-- The ESet inspector/constructor and MuSiC tool can have
          independent Galaxy versions but should reference the same
          package version always. -->
b
diff -r fd7a16d073c5 -r 56371b5a2da9 music-deconvolution.xml
--- a/music-deconvolution.xml Sat Jan 29 12:51:49 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
b'@@ -1,298 +0,0 @@\n-<tool id="music_deconvolution" name="MuSiC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"\n-      profile="21.09" license="GPL-3.0-or-later" >\n-    <description>estimate cell type proportions in bulk RNA-seq data</description>\n-    <macros>\n-        <import>macros.xml</import>\n-    </macros>\n-    <expand macro="requirements" />\n-    <command detect_errors="exit_code" ><![CDATA[\n-mkdir report_data &&\n-Rscript --vanilla \'$__tool_directory__/scripts/${do.method}.R\' \'$conf\'\n-]]></command>\n-    <configfiles>\n-        <configfile name="conf" >\n-\n-null_str_vec = function(gstr){\n-   tokens = unlist(as.vector(strsplit(gstr, split=",")))\n-   if (length(tokens) == 0){\n-      return(NULL)\n-   }\n-   if (length(tokens) == 1){\n-      return(tokens[[1]])\n-   }\n-   return(tokens)\n-}\n-\n-bulk_eset = readRDS(\'$bulk_eset\')\n-scrna_eset = readRDS(\'$scrna_eset\')\n-use_disease_factor = FALSE\n-maxyscale = NA\n-\n-#if str($do.method) == "estimateprops":\n-\n-maxyscale = as.numeric(\'$do.maxyscale\')  ## yields "NA" if blank\n-phenotype_factors = null_str_vec(\'$do.phenotype_factors\')\n-phenotype_factors_always_exclude = null_str_vec(\'$do.phenotype_factors_always_exclude\')\n-celltypes_label = null_str_vec(\'$do.celltypes_label\')\n-samples_label = null_str_vec(\'$do.samples_label\')\n-celltypes = null_str_vec(\'$do.celltypes\')\n-methods = c("MuSiC", "NNLS")\n-\n-    #if str($do.disease_factor.use) == "yes":\n-use_disease_factor = TRUE\n-phenotype_scrna_target = null_str_vec(\'$do.disease_factor.phenotype_scrna_target\')\n-phenotype_target = null_str_vec(\'$do.disease_factor.phenotype_target\')\n-phenotype_target_threshold = as.numeric(\'$do.disease_factor.phenotype_target_threshold\')\n-sample_disease_group = null_str_vec(\'$do.disease_factor.sample_disease_group\')\n-sample_disease_group_scale = as.integer(\'$do.disease_factor.sample_disease_group_scale\')\n-    #end if\n-\n-outfile_pdf=\'$out_pdf\'\n-\n-#elif str($do.method) == "dendrogram":\n-\n-celltypes_label = null_str_vec(\'$do.celltypes_label\')\n-clustertype_label = null_str_vec(\'$do.clustertype_label\')\n-samples_label = null_str_vec(\'$do.samples_label\')\n-celltypes = null_str_vec(\'$do.celltypes\')\n-\n-data.to.use = list(\n-    #for $i, $repeat in enumerate( $do.cluster_groups )\n-        #if $i == 0:\n-        $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n-            marker.names = null_str_vec(\'$repeat.marker_name\'),\n-            marker.list = read_list(\'$repeat.marker_list\'))\n-        #else\n-        , $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n-            marker.names = null_str_vec(\'$repeat.marker_name\'),\n-            marker.list = read_list(\'$repeat.marker_list\'))\n-        #end if\n-    #end for\n-)\n-\n-outfile_pdf=\'$out_pdf\'\n-outfile_tab=\'$out_tab\'\n-\n-#else\n-    stop("No such option")\n-#end if\n-\n-        </configfile>\n-    </configfiles>\n-    <inputs>\n-        <param name="scrna_eset" label="scRNA Dataset" type="data" format="@RDATATYPE@" />\n-        <param name="bulk_eset" label="Bulk RNA Dataset" type="data" format="@RDATATYPE@" />\n-        <conditional name="do" >\n-            <param name="method" type="select" label="Purpose" >\n-                <!-- The values here correspond to script names in the scripts folder\n-                     and must remain so -->\n-                <option value="estimateprops">Estimate Proportions</option>\n-                <option value="dendrogram">Compute Dendrogram</option>\n-            </param>\n-            <when value="estimateprops" >\n-                <param name="celltypes_label" type="text" value="cellType"\n-                       label="Cell Types Label from scRNA dataset" >\n-                    <expand macro="validator_text" />\n-                </param>\n-                <param name="samples_label" type="text" value="sampleID"\n-                       label="Samples Identifier from scRNA dataset" >\n-                    <expand macro="validator_text" />\n-                </param>\n-                <expand macro="celltypes_macro" />'..b'>\n-                </repeat>\n-            </conditional>\n-            <output name="out_pdf" value="dendro.pdf" compare="sim_size" />\n-            <output name="out_tab">\n-                <assert_contents>\n-                    <has_text_matching expression="^\\s+Neutro\\s+Podo\\s+Endo" />\n-                    <has_text text="APOL1.GNA78M"/>\n-                </assert_contents>\n-            </output>\n-        </test>\n-        <test expect_num_outputs="2" >\n-            <!-- Estimate Proportions: no disease factor, no fitting reports -->\n-            <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />\n-            <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />\n-            <conditional name="do" >\n-                <param name="method" value="estimateprops" />\n-                <param name="celltypes_label" value="cellType" />\n-                <param name="samples_label" value="sampleID" />\n-                <param name="disease_factor" value="no" />\n-            </conditional>\n-            <output name="out_pdf" value="default_output_no_disease.pdf" compare="sim_size" />\n-        </test>\n-        <test expect_num_outputs="3" >\n-            <!-- Estimate Proportions test -->\n-            <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />\n-            <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />\n-            <conditional name="do" >\n-                <param name="method" value="estimateprops" />\n-                <param name="celltypes_label" value="cellType" />\n-                <param name="samples_label" value="sampleID" />\n-                <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" />\n-                <conditional name="disease_factor" >\n-                    <param name="use" value="yes" />\n-                    <param name="phenotype_scrna_target" value="beta" />\n-                    <param name="phenotype_factors" value="age,bmi,hba1c,gender" />\n-                    <param name="phenotype_target" value="hba1c" />\n-                    <param name="phenotype_target_threshold" value="6.5" />\n-                    <param name="sample_disease_group" value="T2D" />\n-                    <param name="sample_disease_group_scale" value="5" />\n-                </conditional>\n-            </conditional>\n-            <output name="out_pdf" value="default_output.pdf" compare="sim_size" />\n-            <output_collection name="summaries" count="5">\n-                <element name="Log of MuSiC fitting" ftype="txt">\n-                    <assert_contents>\n-                        <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/>\n-                    </assert_contents>\n-                </element>\n-                <element name="Log of NNLS fitting" ftype="txt">\n-                    <assert_contents>\n-                        <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/>\n-                    </assert_contents>\n-                </element>\n-            </output_collection>\n-        </test>\n-    </tests>\n-    <help><![CDATA[\n-MuSiC utilizes cell-type specific gene expression from single-cell RNA sequencing (RNA-seq) data to characterize cell type compositions from bulk RNA-seq data in complex tissues. By appropriate weighting of genes showing cross-subject and cross-cell consistency, MuSiC enables the transfer of cell type-specific gene expression information from one dataset to another.\n-\n-Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster.\n-\n-    ]]></help>\n-    <citations>\n-        <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation>\n-    </citations>\n-</tool>\n\\ No newline at end of file\n'
b
diff -r fd7a16d073c5 -r 56371b5a2da9 music_deconvolution.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/music_deconvolution.xml Thu Feb 10 12:52:31 2022 +0000
[
b'@@ -0,0 +1,315 @@\n+<tool id="music_deconvolution" name="MuSiC Deconvolution" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"\n+      profile="21.09" license="GPL-3.0-or-later" >\n+    <description>estimate cell type proportions in bulk RNA-seq data</description>\n+    <macros>\n+        <import>macros.xml</import>\n+    </macros>\n+    <expand macro="requirements" />\n+    <command detect_errors="exit_code" ><![CDATA[\n+mkdir report_data &&\n+Rscript --vanilla \'$__tool_directory__/scripts/${do.method}.R\' \'$conf\'\n+]]></command>\n+    <configfiles>\n+        <configfile name="conf" >\n+\n+null_str_vec = function(gstr){\n+   tokens = unlist(as.vector(strsplit(gstr, split=",")))\n+   if (length(tokens) == 0){\n+      return(NULL)\n+   }\n+   if (length(tokens) == 1){\n+      return(tokens[[1]])\n+   }\n+   return(tokens)\n+}\n+\n+bulk_eset = readRDS(\'$bulk_eset\')\n+scrna_eset = readRDS(\'$scrna_eset\')\n+use_disease_factor = FALSE\n+maxyscale = NA\n+\n+#if str($do.method) == "estimateprops":\n+\n+maxyscale = as.numeric(\'$do.maxyscale\')  ## yields "NA" if blank\n+phenotype_factors = null_str_vec(\'$do.phenotype_factors\')\n+phenotype_factors_always_exclude = null_str_vec(\'$do.phenotype_factors_always_exclude\')\n+celltypes_label = null_str_vec(\'$do.celltypes_label\')\n+samples_label = null_str_vec(\'$do.samples_label\')\n+celltypes = null_str_vec(\'$do.celltypes\')\n+methods = c(null_str_vec(\'$do.est_methods\'))\n+\n+    #if str($do.disease_factor.use) == "yes":\n+use_disease_factor = TRUE\n+phenotype_scrna_target = null_str_vec(\'$do.disease_factor.phenotype_scrna_target\')\n+phenotype_target = null_str_vec(\'$do.disease_factor.phenotype_target\')\n+phenotype_target_threshold = as.numeric(\'$do.disease_factor.phenotype_target_threshold\')\n+sample_disease_group = null_str_vec(\'$do.disease_factor.sample_disease_group\')\n+sample_disease_group_scale = as.integer(\'$do.disease_factor.sample_disease_group_scale\')\n+    #end if\n+\n+outfile_pdf=\'$out_pdf\'\n+\n+#elif str($do.method) == "dendrogram":\n+\n+celltypes_label = null_str_vec(\'$do.celltypes_label\')\n+clustertype_label = null_str_vec(\'$do.clustertype_label\')\n+samples_label = null_str_vec(\'$do.samples_label\')\n+celltypes = null_str_vec(\'$do.celltypes\')\n+\n+data.to.use = list(\n+    #for $i, $repeat in enumerate( $do.cluster_groups )\n+        #if $i == 0:\n+        $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n+            marker.names = null_str_vec(\'$repeat.marker_name\'),\n+            marker.list = read_list(\'$repeat.marker_list\'))\n+        #else\n+        , $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n+            marker.names = null_str_vec(\'$repeat.marker_name\'),\n+            marker.list = read_list(\'$repeat.marker_list\'))\n+        #end if\n+    #end for\n+)\n+\n+outfile_pdf=\'$out_pdf\'\n+outfile_tab=\'$out_tab\'\n+\n+#else\n+    stop("No such option")\n+#end if\n+\n+        </configfile>\n+    </configfiles>\n+    <inputs>\n+        <param name="scrna_eset" label="scRNA Dataset" type="data" format="@RDATATYPE@" />\n+        <param name="bulk_eset" label="Bulk RNA Dataset" type="data" format="@RDATATYPE@" />\n+        <conditional name="do" >\n+            <param name="method" type="select" label="Purpose" >\n+                <!-- The values here correspond to script names in the scripts folder\n+                     and must remain so -->\n+                <option value="estimateprops">Estimate Proportions</option>\n+                <option value="dendrogram">Compute Dendrogram</option>\n+            </param>\n+            <when value="estimateprops" >\n+                <param name="est_methods" type="select" multiple="true" label="Methods to use" min="1" >\n+                    <option value="MuSiC" selected="true" >MuSiC</option>\n+                    <option value="NNLS" selected="true" >NNLS</option>\n+                </param>\n+                <param name="celltypes_label" type="text" value="cellType"\n+                       label="Cell Types Label from scRNA dataset" >\n+                    <expand macro="validator_tex'..b'"estimateprops" />\n+                <param name="est_methods" value="NNLS" />\n+                <param name="celltypes_label" value="cellType" />\n+                <param name="samples_label" value="sampleID" />\n+                <param name="disease_factor" value="no" />\n+            </conditional>\n+            <output name="out_pdf" value="default_output_no_disease_nnls.pdf" compare="sim_size" />\n+        </test>\n+        <test expect_num_outputs="2" >\n+            <!-- Estimate Proportions: no disease factor, no fitting reports -->\n+            <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />\n+            <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />\n+            <conditional name="do" >\n+                <param name="method" value="estimateprops" />\n+                <param name="celltypes_label" value="cellType" />\n+                <param name="samples_label" value="sampleID" />\n+                <param name="disease_factor" value="no" />\n+            </conditional>\n+            <output name="out_pdf" value="default_output_no_disease.pdf" compare="sim_size" />\n+        </test>\n+        <test expect_num_outputs="3" >\n+            <!-- Estimate Proportions test -->\n+            <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />\n+            <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />\n+            <conditional name="do" >\n+                <param name="method" value="estimateprops" />\n+                <param name="celltypes_label" value="cellType" />\n+                <param name="samples_label" value="sampleID" />\n+                <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" />\n+                <conditional name="disease_factor" >\n+                    <param name="use" value="yes" />\n+                    <param name="phenotype_scrna_target" value="beta" />\n+                    <param name="phenotype_factors" value="age,bmi,hba1c,gender" />\n+                    <param name="phenotype_target" value="hba1c" />\n+                    <param name="phenotype_target_threshold" value="6.5" />\n+                    <param name="sample_disease_group" value="T2D" />\n+                    <param name="sample_disease_group_scale" value="5" />\n+                </conditional>\n+            </conditional>\n+            <output name="out_pdf" value="default_output.pdf" compare="sim_size" />\n+            <output_collection name="summaries" count="5">\n+                <element name="Log of MuSiC fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+                <element name="Log of NNLS fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+            </output_collection>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+MuSiC utilizes cell-type specific gene expression from single-cell RNA sequencing (RNA-seq) data to characterize cell type compositions from bulk RNA-seq data in complex tissues. By appropriate weighting of genes showing cross-subject and cross-cell consistency, MuSiC enables the transfer of cell type-specific gene expression information from one dataset to another.\n+\n+Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster.\n+\n+    ]]></help>\n+    <citations>\n+        <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation>\n+    </citations>\n+</tool>\n\\ No newline at end of file\n'
b
diff -r fd7a16d073c5 -r 56371b5a2da9 scripts/compare.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/compare.R Thu Feb 10 12:52:31 2022 +0000
[
b'@@ -0,0 +1,440 @@\n+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))\n+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))\n+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))\n+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))\n+## We use this script to estimate the effectiveness of proportion methods\n+\n+## Load Conf\n+args <- commandArgs(trailingOnly = TRUE)\n+source(args[1])\n+\n+method_key <- list("MuSiC" = "est_music",\n+                   "NNLS" = "est_nnls")[[est_method]]\n+\n+\n+scale_yaxes <- function(gplot, value) {\n+    if (is.na(value)) {\n+        gplot\n+    } else {\n+        gplot + scale_y_continuous(lim = c(0, value))\n+    }\n+}\n+\n+\n+set_factor_data <- function(bulk_data, factor_name = NULL) {\n+    if (is.null(factor_name)) {\n+        factor_name <- "None" ## change to something plottable\n+    }\n+    pdat <- pData(bulk_data)\n+    sam_fact <- NULL\n+    if (factor_name %in% colnames(pdat)) {\n+        sam_fact <- cbind(rownames(pdat),\n+                          as.character(pdat[[factor_name]]))\n+        cat(paste0("   - factor: ", factor_name,\n+                   " found in phenotypes\\n"))\n+    } else {\n+        ## We assign this as the factor for the entire dataset\n+        sam_fact <- cbind(rownames(pdat),\n+                          factor_name)\n+        cat(paste0("   - factor: assigning \\"", factor_name,\n+                   "\\" to whole dataset\\n"))\n+    }\n+    colnames(sam_fact) <- c("Samples", "Factors")\n+    return(as.data.frame(sam_fact))\n+}\n+\n+## Due to limiting sizes, we need to load and unload\n+## possibly very large datasets.\n+process_pair <- function(sc_data, bulk_data,\n+                         ctypes_label, samples_label, ctypes,\n+                         factor_group) {\n+    ## - Generate\n+    est_prop <- music_prop(\n+        bulk.eset = bulk_data, sc.eset = sc_data,\n+        clusters = ctypes_label,\n+        samples = samples_label, select.ct = ctypes, verbose = T)\n+    ## -\n+    estimated_music_props <- est_prop$Est.prop.weighted\n+    estimated_nnls_props <- est_prop$Est.prop.allgene\n+    ## -\n+    fact_data <- set_factor_data(bulk_data, factor_group)\n+    ## -\n+    return(list(est_music = estimated_music_props,\n+                est_nnls = estimated_nnls_props,\n+                bulk_sample_totals = colSums(exprs(bulk_data)),\n+                plot_groups = fact_data))\n+}\n+\n+music_on_all <- function(files) {\n+    results <- list()\n+    for (sc_name in names(files)) {\n+        cat(paste0("sc-group:", sc_name, "\\n"))\n+        scgroup <- files[[sc_name]]\n+        ## - sc Data\n+        sc_est <- readRDS(scgroup$dataset)\n+        ## - params\n+        celltypes_label <- scgroup$label_cell\n+        samples_label <- scgroup$label_sample\n+        celltypes <- scgroup$celltype\n+\n+        results[[sc_name]] <- list()\n+        for (bulk_name in names(scgroup$bulk)) {\n+            cat(paste0(" - bulk-group:", bulk_name, "\\n"))\n+            bulkgroup <- scgroup$bulk[[bulk_name]]\n+            ## - bulk Data\n+            bulk_est <- readRDS(bulkgroup$dataset)\n+            ## - bulk params\n+            pheno_facts <- bulkgroup$pheno_facts\n+            pheno_excl <- bulkgroup$pheno_excl\n+            ##\n+            results[[sc_name]][[bulk_name]] <- process_pair(\n+                sc_est, bulk_est,\n+                celltypes_label, samples_label,\n+                celltypes, bulkgroup$factor_group)\n+            ##\n+            rm(bulk_est) ## unload\n+        }\n+        rm(sc_est) ## unload\n+    }\n+    return(results)\n+}\n+\n+plot_all_individual_heatmaps <- function(results) {\n+    pdf(out_heatmulti_pdf, width = 8, height = 8)\n+    for (sc_name in names(results)) {\n+        for (bk_name in names(results[[sc_name]])) {\n+            res <- results[[sc_name]][[bk_name]]\n+            plot_hmap <- Prop_heat_Est(\n+                data.matrix(res[[method_key]]), method.name = est_method) +\n+                ggtitle(paste0("[", est_method, "Cell type ",\n+                          '..b'\n+    B$p2 <- common1 + geom_boxplot(aes(y = Bulk, color = Cell)) +\n+        ylab("Bulk Dataset")\n+    ## -- Factor plots are optional\n+    A$p3 <- B$p3 <- A$p4 <- B$p4 <- ggplot() + theme_void()\n+\n+    if (do_factors) {\n+        A$p3 <- common1 + geom_boxplot(aes(y = Cell, color = Factors))\n+        A$p4 <- common2 + geom_boxplot(aes(y = Cell, color = Factors))\n+        B$p3 <- common1 + geom_boxplot(aes(y = Bulk, color = Factors)) +\n+            ylab("Bulk Dataset")\n+        B$p4 <- common2 + geom_boxplot(aes(y = Bulk, color = Factors)) +\n+            ylab("Bulk Dataset")\n+    }\n+\n+    title_a <- "Cell Types against Bulk"\n+    title_b <- "Bulk Datasets against Cells"\n+    if (do_factors) {\n+        title_a <- paste0(title_a, " and Factors")\n+        title_b <- paste0(title_b, " and Factors")\n+    }\n+\n+    a_all <- plot_grid(ggdraw() + draw_label(title_a, fontface = "bold"),\n+                       plot_grid(plotlist = A, ncol = 2),\n+                       ncol = 1, rel_heights = c(0.05, 0.95))\n+    b_all <- plot_grid(ggdraw() + draw_label(title_b, fontface = "bold"),\n+                       plot_grid(plotlist = B, ncol = 2),\n+                       ncol = 1, rel_heights = c(0.05, 0.95))\n+    return(list(cell = a_all, bulk = b_all))\n+}\n+\n+filter_output <- function(grudat_spread_melt, out_filt) {\n+    print_red <- function(comment, red_list) {\n+        cat(paste(comment, paste(red_list, collapse = ", "), "\\n"))\n+    }\n+    grudat_filt <- grudat_spread_melt\n+    print_red("Total Cell types:", unique(grudat_filt$Cell))\n+    if (!is.null(out_filt$cells)) {\n+        grudat_filt <- grudat_filt[grudat_filt$Cell %in% out_filt$cells, ]\n+        print_red(" - selecting:", out_filt$cells)\n+    }\n+    print_red("Total Factors:", unique(grudat_spread_melt$Factors))\n+    if (!is.null(out_filt$facts)) {\n+        grudat_filt <- grudat_filt[grudat_filt$Factors %in% out_filt$facts, ]\n+        print_red(" - selecting:", out_filt$facts)\n+    }\n+    return(grudat_filt)\n+}\n+\n+\n+results <- music_on_all(files)\n+\n+if (heat_grouped_p) {\n+    plot_grouped_heatmaps(results)\n+} else {\n+    plot_all_individual_heatmaps(results)\n+}\n+\n+save.image("/tmp/sesh.rds")\n+\n+summat <- summarized_matrix(results)\n+grudat <- group_by_dataset(summat)\n+grudat_spread_melt <- merge_factors_spread(grudat$spread,\n+                                           flatten_factor_list(results))\n+\n+\n+\n+## The output filters ONLY apply to boxplots, since these take\n+do_factors <- (length(unique(grudat_spread_melt[["Factors"]])) > 1)\n+\n+grudat_spread_melt_filt <- filter_output(grudat_spread_melt, out_filt)\n+\n+heat_maps <- summarize_heatmaps(grudat_spread_melt_filt, do_factors)\n+box_plots <- summarize_boxplots(grudat_spread_melt_filt, do_factors)\n+\n+pdf(out_heatsumm_pdf, width = 14, height = 14)\n+print(heat_maps)\n+print(box_plots)\n+dev.off()\n+\n+## Generate output tables\n+stats_prop <- lapply(grudat$spread$prop, function(x) {\n+    t(apply(x, 1, summary))})\n+stats_scale <- lapply(grudat$spread$scale, function(x) {\n+    t(apply(x, 1, summary))})\n+\n+writable2 <- function(obj, prefix, title) {\n+    write.table(obj,\n+                file = paste0("report_data/", prefix, "_",\n+                              title, ".tabular"),\n+                quote = F, sep = "\\t", col.names = NA)\n+}\n+## Make the value table printable\n+grudat_spread_melt$value.scale <- as.integer(grudat_spread_melt$value.scale) # nolint\n+colnames(grudat_spread_melt) <- c("Sample", "Cell", "Bulk", "Factors",\n+                                  "CT Prop in Sample", "Number of Reads")\n+\n+writable2(grudat_spread_melt, "values", "Data Table")\n+writable2(summat$prop, "values", "Matrix of Cell Type Sample Proportions")\n+writable2({\n+    aa <- as.matrix(summat$scaled); mode(aa) <- "integer"; aa\n+}, "values", "Matrix of Cell Type Read Counts")\n+\n+for (bname in names(stats_prop)) {\n+    writable2(stats_prop[[bname]], "stats", paste0(bname, ": Sample Props"))\n+    writable2(stats_scale[[bname]], "stats", paste0(bname, ": Read Props"))\n+}\n'
b
diff -r fd7a16d073c5 -r 56371b5a2da9 scripts/estimateprops.R
--- a/scripts/estimateprops.R Sat Jan 29 12:51:49 2022 +0000
+++ b/scripts/estimateprops.R Thu Feb 10 12:52:31 2022 +0000
[
b'@@ -14,8 +14,12 @@\n     clusters = celltypes_label,\n     samples = samples_label, select.ct = celltypes, verbose = T)\n \n+\n estimated_music_props <- est_prop$Est.prop.weighted\n estimated_nnls_props <- est_prop$Est.prop.allgene\n+##\n+estimated_music_props_flat <- melt(estimated_music_props)\n+estimated_nnls_props_flat <- melt(estimated_nnls_props)\n \n scale_yaxes <- function(gplot, value) {\n     if (is.na(value)) {\n@@ -25,23 +29,36 @@\n     }\n }\n \n+sieve_data <- function(func, music_data, nnls_data) {\n+    if (func == "list") {\n+        res <- list(if ("MuSiC" %in% methods) music_data else NULL,\n+                    if ("NNLS" %in% methods) nnls_data else NULL)\n+        res[lengths(res) > 0] ## filter out NULL elements\n+    } else if (func == "rbind") {\n+        rbind(if ("MuSiC" %in% methods) music_data else NULL,\n+              if ("NNLS" %in% methods) nnls_data else NULL)\n+    } else if (func == "c") {\n+        c(if ("MuSiC" %in% methods) music_data else NULL,\n+          if ("NNLS" %in% methods) nnls_data else NULL)\n+    }\n+}\n+\n+\n ## Show different in estimation methods\n ## Jitter plot of estimated cell type proportions\n jitter_fig <- scale_yaxes(Jitter_Est(\n-    list(data.matrix(estimated_music_props),\n-         data.matrix(estimated_nnls_props)),\n+    sieve_data("list",\n+               data.matrix(estimated_music_props),\n+               data.matrix(estimated_nnls_props)),\n     method.name = methods, title = "Jitter plot of Est Proportions",\n     size = 2, alpha = 0.7) + theme_minimal(), maxyscale)\n \n-\n ## Make a Plot\n ## A more sophisticated jitter plot is provided as below. We separated\n ## the T2D subjects and normal subjects by their disease factor levels.\n-estimated_music_props_flat <- melt(estimated_music_props)\n-estimated_nnls_props_flat <- melt(estimated_nnls_props)\n-\n-m_prop <- rbind(estimated_music_props_flat,\n-                estimated_nnls_props_flat)\n+m_prop <- sieve_data("rbind",\n+                     estimated_music_props_flat,\n+                     estimated_nnls_props_flat)\n colnames(m_prop) <- c("Sub", "CellType", "Prop")\n \n if (is.null(celltypes)) {\n@@ -69,7 +86,7 @@\n         phenotype_target_threshold <- -Inf\n         message("phenotype target threshold set to -Inf")\n     }\n-\n+    ## the "2" here is to do with the sample groups, not number of methods\n     m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint\n     m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]\n     ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2\n@@ -84,8 +101,10 @@\n     m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),\n                     subset(m_prop, Disease == sample_disease_group))\n \n-    jitter_new <- scale_yaxes(ggplot(m_prop, aes(Method, Prop)) +\n-        geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n+    jitter_new <- scale_yaxes(\n+        ggplot(m_prop, aes(Method, Prop)) +\n+        geom_point(aes(fill = Method, color = Disease,\n+                       stroke = D, shape = Disease),\n                    size = 2, alpha = 0.7,\n                    position = position_jitter(width = 0.25, height = 0)) +\n         facet_wrap(~ CellType, scales = "free") +\n@@ -100,21 +119,23 @@\n     ## Create dataframe for beta cell proportions and Disease_factor levels\n     ## - Ugly code. Essentially, doubles the cell type proportions for each\n     ##   set of MuSiC and NNLS methods\n-    m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint\n-                                              phenotype_factors],\n-                             ## get proportions of target cell type\n-                             ct.prop = c(estimated_music_props[, phenotype_scrna_target],\n-                                         estimated_nnls_props[, phenotype_scrna_target]),\n-                             ##\n-                             Method = factor(rep(methods,\n-                                                 each = nrow(estimated_music_'..b'       geom_smooth(method = "lm",  se = FALSE, col = "black", lwd = 0.25) +\n-        geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n+        geom_point(aes(fill = Method, color = Disease,\n+                       stroke = D, shape = Disease),\n                    size = 2, alpha = 0.7) +  facet_wrap(~ Method) +\n         ggtitle(paste0(toupper(phenotype_target), " vs. ",\n-                       toupper(phenotype_scrna_target), " Cell Type Proportion")) +\n+                       toupper(phenotype_scrna_target),\n+                       " Cell Type Proportion")) +\n         theme_minimal() +\n         ylab(paste0("Proportion of ",\n                     phenotype_scrna_target, " cells")) +\n@@ -138,19 +162,22 @@\n }\n \n ## BoxPlot\n-plot_box <- scale_yaxes(Boxplot_Est(list(\n-    data.matrix(estimated_music_props),\n-    data.matrix(estimated_nnls_props)),\n-    method.name = c("MuSiC", "NNLS")) +\n+plot_box <- scale_yaxes(Boxplot_Est(\n+    sieve_data("list",\n+               data.matrix(estimated_music_props),\n+               data.matrix(estimated_nnls_props)),\n+    method.name = methods) +\n     theme(axis.text.x = element_text(angle = -90),\n           axis.text.y = element_text(size = 8)) +\n     ggtitle(element_blank()) + theme_minimal(), maxyscale)\n \n ## Heatmap\n-plot_hmap <- Prop_heat_Est(list(\n-    data.matrix(estimated_music_props),\n-    data.matrix(estimated_nnls_props)),\n-    method.name = c("MuSiC", "NNLS")) +\n+plot_hmap <- Prop_heat_Est(\n+    sieve_data(\n+        "list",\n+        data.matrix(estimated_music_props),\n+        data.matrix(estimated_nnls_props)),\n+    method.name = methods) +\n     theme(axis.text.x = element_text(angle = -90),\n           axis.text.y = element_text(size = 6))\n \n@@ -167,33 +194,29 @@\n plot_hmap\n message(dev.off())\n \n-## Output Proportions\n+writable <- function(obj, prefix, title) {\n+    write.table(obj,\n+                file = paste0("report_data/", prefix, "_",\n+                              title, ".tabular"),\n+                quote = F, sep = "\\t", col.names = NA)\n+}\n \n-write.table(est_prop$Est.prop.weighted,\n-            file = paste0("report_data/prop_",\n-                          "Music Estimated Proportions of Cell Types",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Est.prop.allgene,\n-            file = paste0("report_data/prop_",\n-                          "NNLS Estimated Proportions of Cell Types",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Weight.gene,\n-            file = paste0("report_data/weightgene_",\n-                          "Music Estimated Proportions of Cell Types (by Gene)",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$r.squared.full,\n-            file = paste0("report_data/rsquared_",\n-                          "Music R-sqr Estimated Proportions of Each Subject",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n-write.table(est_prop$Var.prop,\n-            file = paste0("report_data/varprop_",\n-                          "Matrix of Variance of MuSiC Estimates",\n-                          ".tabular"),\n-            quote = F, sep = "\\t", col.names = NA)\n+## Output Proportions\n+if ("NNLS" %in% methods) {\n+    writable(est_prop$Est.prop.allgene, "prop",\n+             "NNLS Estimated Proportions of Cell Types")\n+}\n+\n+if ("MuSiC" %in% methods) {\n+    writable(est_prop$Est.prop.weighted, "prop",\n+             "Music Estimated Proportions of Cell Types")\n+    writable(est_prop$Weight.gene, "weightgene",\n+             "Music Estimated Proportions of Cell Types (by Gene)")\n+    writable(est_prop$r.squared.full, "rsquared",\n+             "Music R-sqr Estimated Proportions of Each Subject")\n+    writable(est_prop$Var.prop, "varprop",\n+             "Matrix of Variance of MuSiC Estimates")\n+}\n \n \n if (use_disease_factor) {\n'
b
diff -r fd7a16d073c5 -r 56371b5a2da9 test-data/default_output_no_disease_nnls.pdf
b
Binary file test-data/default_output_no_disease_nnls.pdf has changed
b
diff -r fd7a16d073c5 -r 56371b5a2da9 test-data/out_filt1.pdf
b
Binary file test-data/out_filt1.pdf has changed
b
diff -r fd7a16d073c5 -r 56371b5a2da9 test-data/out_heat2.pdf
b
Binary file test-data/out_heat2.pdf has changed