changeset 1:3ca0132c182a draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 683bb72ae92b5759a239b7e3bf4c5a229ed35b54"
author bgruening
date Fri, 26 Nov 2021 15:54:51 +0000
parents 224721e76869
children 1c4cf4b7debe
files macros.xml music-deconvolution.xml scripts/dendrogram.R scripts/estimateprops.R scripts/inspect.R test-data/EMTABesethealthy.subset.rds test-data/Mousebulkeset.rds test-data/Mousesubeset.degenesonly2.half.rds test-data/default_output.pdf test-data/dendro.pdf test-data/dendro_1.pdf
diffstat 11 files changed, 292 insertions(+), 127 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Sun Sep 12 19:48:48 2021 +0000
+++ b/macros.xml	Fri Nov 26 15:54:51 2021 +0000
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <!-- The ESet inspector/constructor and MuSiC tool can have
          independent Galaxy versions but should reference the same
          package version always. -->
@@ -15,11 +15,11 @@
         <validator type="regex" message="FORMAT terms separated by commas">^(([A-Za-z0-9+_ -]+)\s?,?)*$</validator>
     </xml>
     <xml name="validator_text" >
-        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)*$</validator>
+        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)+$</validator>
     </xml>
     <xml name="celltypes_macro" >
         <param name="celltypes" type="text" optional="true" value=""
-               label="Comma list of cell types to use from scRNA dataset" help="If NULL, then use all cell types." >
+               label="Comma list of cell types to use from scRNA dataset" help="If blank, then use all available cell types." >
             <expand macro="validator_index_identifiers" />
         </param>
     </xml>
--- a/music-deconvolution.xml	Sun Sep 12 19:48:48 2021 +0000
+++ b/music-deconvolution.xml	Fri Nov 26 15:54:51 2021 +0000
@@ -1,11 +1,11 @@
 <tool id="music_deconvolution" name="MuSiC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
-      profile="20.05" license="GPL-3.0-or-later" >
+      profile="21.09" license="GPL-3.0-or-later" >
     <description>estimate cell type proportions in bulk RNA-seq data</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements" />
-    <command detect_errors="exit_code"><![CDATA[
+    <command detect_errors="exit_code" ><![CDATA[
 mkdir report_data &&
 Rscript --vanilla '$__tool_directory__/scripts/${do.method}.R' '$conf'
 ]]></command>
@@ -29,16 +29,17 @@
 #if str($do.method) == "estimateprops":
 
 phenotype_factors = null_str_vec('$do.phenotype_factors')
+phenotype_factors_always_exclude = null_str_vec('$do.phenotype_factors_always_exclude')
 celltypes_label = null_str_vec('$do.celltypes_label')
 samples_label = null_str_vec('$do.samples_label')
 celltypes = null_str_vec('$do.celltypes')
-methods = null_str_vec('$do.methods')
-phenotype_gene = null_str_vec('$do.phenotype_gene')
-sample_groups = null_str_vec('$do.sample_groups')
+methods = c("MuSiC", "NNLS")
+phenotype_target = null_str_vec('$do.phenotype_target')
+phenotype_target_threshold = as.numeric('$do.phenotype_target_threshold')
 sample_disease_group = null_str_vec('$do.sample_disease_group')
 sample_disease_group_scale = as.integer('$do.sample_disease_group_scale')
-healthy_phenotype = null_str_vec('$do.healthy_phenotype')
 compare_title = null_str_vec('$do.compare_title')
+
 outfile_pdf='$out_pdf'
 
 #elif str($do.method) == "dendrogram":
@@ -91,20 +92,24 @@
                     <expand macro="validator_text" />
                 </param>
                 <expand macro="celltypes_macro" />
-                <param name="methods" multiple="true" type="select" display="checkboxes" label="Cell Proportion Method" >
-                    <option value="MuSiC" selected="true" />
-                    <option value="NNLS" selected="true" />
-                </param>
                 <param name="phenotype_factors" type="text"
-                       label="List of phenotypes factors" help="If blank, then use all phenotypes." >
+                       label="Phenotype factors"
+                       help="List of phenotypes factors to be used in the linear regression. Please make sure that each factor has more than one unique value. Names correspond to column names in the bulk RNA dataset phenotype table. If blank, then treat all bulk phenotype columns as factors." >
                     <expand macro="validator_index_identifiers" />
                 </param>
-                <param name="phenotype_gene" type="text" label="Causative Gene"
-                       help="MUST exist in the phenotype factors above." >
+                <param name="phenotype_factors_always_exclude" type="text"
+                       label="Excluded phenotype factors"
+                       help="List of phenotype factors to always exclude in the analysis"
+                       value="sampleID,SubjectName" >
+                    <expand macro="validator_index_identifiers" />
+                </param>
+                <param name="phenotype_target" type="text" label="Phenotype Target"
+                       help="MUST exist in the bulk RNA datasets phenotype factors, as above." >
                     <expand macro="validator_text" />
                 </param>
-                <param name="sample_groups" type="text" label="List of Sample Groups" >
-                    <expand macro="validator_index_identifiers" />
+                <param name="phenotype_target_threshold" type="float" label="Phenotype Target Threshold"
+                       value="-99"
+                       help="The (%) threshold at which the phenotype target manifests. Leave at -99 to select all." >
                 </param>
                 <param name="sample_disease_group" type="text" label="Sample Disease Group"
                        help="MUST exist in the sample_groups above." >
@@ -113,9 +118,6 @@
                 <param name="sample_disease_group_scale" type="integer"
                        label="Sample Disease Group (Scale)" value="5"
                        help="Used to accentutate certain features in the plots. Increase this number to reduce the effect." />
-                <param name="healthy_phenotype" type="text" label="Healthy Phenotype" >
-                    <expand macro="validator_text" />
-                </param>
                 <param name="compare_title" type="text" label="Plot Title" >
                     <expand macro="validator_text" />
                 </param>
@@ -126,7 +128,7 @@
                     <expand macro="validator_text" />
                 </param>
                 <param name="clustertype_label" type="text" value="clusterType"
-                       label="Cell Types Label from scRNA dataset" >
+                       label="Cluster Types Label from scRNA dataset" >
                     <expand macro="validator_text" />
                 </param>
                 <param name="samples_label" type="text" value="sampleID"
@@ -134,13 +136,14 @@
                     <expand macro="validator_text" />
                 </param>
                 <expand macro="celltypes_macro" />
-                <repeat name="cluster_groups" title="Cluster Groups" min="2" >
+                <repeat name="cluster_groups" title="Cluster Groups" min="0"
+                        help="Insert cell cluster groups based on a previous clustering." >
                     <param name="cluster_id" label="Cluster ID" type="text" value=""
                            help="e.g. C1 or Cluster1, etc." />
                     <expand macro="celltypes_macro" />
                     <param name="marker_name" label="Marker Gene Group Name" type="text"
                            optional="true" value=""
-                           help="Name of the list of geme markers used to describe the marker list supplied below." >
+                           help="Name of the list of gene markers used to describe the marker list supplied below." >
                         <expand macro="validator_text" />
                     </param>
                     <param name="marker_list" label="List of Gene Markers" type="data" format="txt,tabular"
@@ -153,17 +156,35 @@
     <outputs>
         <data name="out_pdf" format="pdf" label="${tool.name} on ${on_string}: PDF Plots" />
         <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: Cell Proportions by Sample" >
-            <filter>do["method"] == "dendrogram"</filter>
+            <filter>do["method"] == "dendrogram" and len(do["cluster_groups"]) > 0</filter>
         </data>
-        <collection name="summaries" type="list" label="${tool.name} on ${on_string}: Method Summaries">
+        <collection name="props" type="list" label="${tool.name} on ${on_string}: Proportion Matrices" >
             <filter>do["method"] == "estimateprops"</filter>
-            <discover_datasets pattern="summ_(?P&lt;designation&gt;.+)\.txt" format="txt"
-                               directory="report_data" />
+            <discover_datasets pattern="prop_(?P&lt;designation&gt;.+)\.tabular" format="tabular" directory="report_data" />
+        </collection>
+        <collection name="summaries" type="list" label="${tool.name} on ${on_string}: Summaries and Logs">
+            <filter>do["method"] == "estimateprops"</filter>
+            <discover_datasets pattern="summ_(?P&lt;designation&gt;.+)\.txt" format="txt" directory="report_data" />
+            <discover_datasets pattern="varprop_(?P&lt;designation&gt;.+)\.tabular" format="tabular" directory="report_data" />
+            <discover_datasets pattern="rsquared_(?P&lt;designation&gt;.+)\.tabular" format="tabular" directory="report_data" />
+            <discover_datasets pattern="weightgene_(?P&lt;designation&gt;.+)\.tabular" format="tabular" directory="report_data" />
         </collection>
     </outputs>
     <tests>
+        <test expect_num_outputs="1" >
+            <!-- Dendrogram test 1 -->
+            <param name="bulk_eset" value="Mousebulkeset.rds" />
+            <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" />
+            <conditional name="do" >
+                <param name="method" value="dendrogram" />
+                <param name="celltypes_label" value="cellType" />
+                <param name="samples_label" value="sampleID" />
+                <param name="celltypes" value="Endo,Podo,PT,LOH,DCT,CD-PC,CD-IC,Fib,Macro,Neutro,B lymph,T lymph,NK" />
+            </conditional>
+            <output name="out_pdf" value="dendro_1.pdf" compare="sim_size" />
+        </test>
         <test expect_num_outputs="2" >
-            <!-- Dendrogram test -->
+            <!-- Dendrogram test 2 -->
             <param name="bulk_eset" value="Mousebulkeset.rds" />
             <param name="scrna_eset" value="Mousesubeset.degenesonly2.half.rds" />
             <conditional name="do" >
@@ -195,12 +216,12 @@
             <output name="out_pdf" value="dendro.pdf" compare="sim_size" />
             <output name="out_tab">
                 <assert_contents>
-                    <has_text_matching expression="^\s+Est\.prop\.weighted\.cluster\.Neutro\s+Est\.prop\.weighted\.cluster\.Podo\s+Est\.prop\.weighted\.cluster\.Endo" />
+                    <has_text_matching expression="^\s+Neutro\s+Podo\s+Endo" />
                     <has_text text="APOL1.GNA78M"/>
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="2" >
+        <test expect_num_outputs="3" >
             <!-- Estimate Proportions test -->
             <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />
             <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />
@@ -209,25 +230,23 @@
                 <param name="celltypes_label" value="cellType" />
                 <param name="samples_label" value="sampleID" />
                 <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" />
-                <param name="methods" value="MuSiC,NNLS" />
                 <param name="phenotype_factors" value="age,bmi,hba1c,gender" />
-                <param name="phenotype_gene" value="hba1c" />
-                <param name="sample_groups" value="Normal,T2D" />
+                <param name="phenotype_target" value="hba1c" />
+                <param name="phenotype_target_threshold" value="6.5" />
                 <param name="sample_disease_group" value="T2D" />
                 <param name="sample_disease_group_scale" value="5" />
-                <param name="healthy_phenotype" value="Normal" />
                 <param name="compare_title" value="HbA1c vs Beta Cell Type Proportion" />
             </conditional>
             <output name="out_pdf" value="default_output.pdf" compare="sim_size" />
-            <output_collection name="summaries" count="2">
-                <element name="MuSiC" ftype="txt">
+            <output_collection name="summaries" count="5">
+                <element name="Log of MuSiC fitting" ftype="txt">
                     <assert_contents>
-                        <has_text text="Residual standard error: 0.1662 on 72 degrees of freedom"/>
+                        <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/>
                     </assert_contents>
                 </element>
-                <element name="NNLS" ftype="txt">
+                <element name="Log of NNLS fitting" ftype="txt">
                     <assert_contents>
-                        <has_text text="Residual standard error: 0.06561 on 72 degrees of freedom"/>
+                        <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/>
                     </assert_contents>
                 </element>
             </output_collection>
@@ -238,7 +257,8 @@
 
 Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster.
 
-.. image:: https://xuranw.github.io/MuSiC/articles/images/FigureMethod.jpg
+.. image:: $PATH_TO_IMAGES/FigureMethod.jpg
+
     ]]></help>
     <citations>
         <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation>
--- a/scripts/dendrogram.R	Sun Sep 12 19:48:48 2021 +0000
+++ b/scripts/dendrogram.R	Fri Nov 26 15:54:51 2021 +0000
@@ -17,31 +17,6 @@
 args <- commandArgs(trailingOnly = TRUE)
 source(args[1])
 
-## We then perform bulk tissue cell type estimation with pre-grouping
-## of cell types: C, list_of_cell_types, marker genes name, marker
-## genes list.
-## data.to.use = list(
-##     "C1" = list(cell.types = c("Neutro"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C2" = list(cell.types = c("Podo"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
-##                 marker.names = "Epithelial",
-##                 marker.list = read_list("../test-data/epith.markers")),
-##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
-##                 marker.names = "Immune",
-##                 marker.list = read_list("../test-data/immune.markers"))
-## )
-grouped_celltypes <- lapply(data.to.use, function(x) {
-    x$cell.types
-})
-marker_groups <- lapply(data.to.use, function(x) {
-    x$marker.list
-})
-names(marker_groups) <- names(data.to.use)
-
 
 ## Perform the estimation
 ## Produce the first step information
@@ -51,33 +26,107 @@
 
 ## Plot the dendrogram of design matrix and cross-subject mean of
 ## realtive abundance
-par(mfrow = c(1, 2))
-d <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+## Hierarchical clustering using Complete Linkage
+d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+hc1 <- hclust(d1, method = "complete")
 ## Hierarchical clustering using Complete Linkage
-hc1 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
-d <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
-## Hierarchical clustering using Complete Linkage
-hc2 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-pdf(file = outfile_pdf, width = 8, height = 8)
-plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
+hc2 <- hclust(d2, method = "complete")
+
 
-cl_type <- as.character(scrna_eset[[celltypes_label]])
+if (length(data.to.use) > 0) {
+    ## We then perform bulk tissue cell type estimation with pre-grouping
+    ## of cell types: C, list_of_cell_types, marker genes name, marker
+    ## genes list.
+    ## data.to.use = list(
+    ##     "C1" = list(cell.types = c("Neutro"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C2" = list(cell.types = c("Podo"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
+    ##                 marker.names = "Epithelial",
+    ##                 marker.list = read_list("../test-data/epith.markers")),
+    ##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
+    ##                 marker.names = "Immune",
+    ##                 marker.list = read_list("../test-data/immune.markers"))
+    ## )
+    grouped_celltypes <- lapply(data.to.use, function(x) {
+        x$cell.types
+    })
+    marker_groups <- lapply(data.to.use, function(x) {
+        x$marker.list
+    })
+    names(marker_groups) <- names(data.to.use)
+
+
+    cl_type <- as.character(scrna_eset[[celltypes_label]])
+
+    for (cl in seq_len(length(grouped_celltypes))) {
+        cl_type[cl_type %in%
+                grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
+    }
+    pData(scrna_eset)[[clustertype_label]] <- factor(
+        cl_type, levels = c(names(grouped_celltypes),
+                            "CD-Trans", "Novel1", "Novel2"))
+
+    est_bulk <- music_prop.cluster(
+        bulk.eset = bulk_eset, sc.eset = scrna_eset,
+        group.markers = marker_groups, clusters = celltypes_label,
+        groups = clustertype_label, samples = samples_label,
+        clusters.type = grouped_celltypes
+    )
 
-for (cl in seq_len(length(grouped_celltypes))) {
-  cl_type[cl_type %in% grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
-}
-pData(scrna_eset)[[clustertype_label]] <- factor(
-    cl_type, levels = c(names(grouped_celltypes),
-                        "CD-Trans", "Novel1", "Novel2"))
+    estimated_music_props <- est_bulk$Est.prop.weighted.cluster
+    ## NNLS is not calculated here
+
+    ## Show different in estimation methods
+    ## Jitter plot of estimated cell type proportions
+    methods_list <- c("MuSiC")
+
+    jitter_fig <- Jitter_Est(
+        list(data.matrix(estimated_music_props)),
+        method.name = methods_list, title = "Jitter plot of Est Proportions",
+        size = 2, alpha = 0.7) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
+
+    plot_box <- Boxplot_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
 
-est_bulk <- music_prop.cluster(
-    bulk.eset = bulk_eset, sc.eset = scrna_eset,
-    group.markers = marker_groups, clusters = celltypes_label,
-    groups = clustertype_label, samples = samples_label,
-    clusters.type = grouped_celltypes)
+    plot_hmap <- Prop_heat_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text.y = element_text(size = 6),
+              axis.text.x = element_text(angle = -90, size = 5),
+              plot.title = element_text(size = 9),
+              legend.key.width = unit(0.15, "cm"),
+              legend.text = element_text(size = 5),
+              legend.title = element_text(size = 5))
 
-write.table(est_bulk, file = outfile_tab, quote = F, col.names = NA, sep = "\t")
-dev.off()
+}
+    
+pdf(file = outfile_pdf, width = 8, height = 8)
+par(mfrow = c(1, 2))
+plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
+plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+if (length(data.to.use) > 0) {
+    plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2)
+}
+message(dev.off())
+
+if (length(data.to.use) > 0) {
+    write.table(estimated_music_props,
+                file = outfile_tab, quote = F, col.names = NA, sep = "\t")
+}
--- a/scripts/estimateprops.R	Sun Sep 12 19:48:48 2021 +0000
+++ b/scripts/estimateprops.R	Fri Nov 26 15:54:51 2021 +0000
@@ -14,36 +14,67 @@
     clusters = celltypes_label,
     samples = samples_label, select.ct = celltypes, verbose = T)
 
+estimated_music_props <- est_prop$Est.prop.weighted
+estimated_nnls_props <- est_prop$Est.prop.allgene
 
 ## Show different in estimation methods
 ## Jitter plot of estimated cell type proportions
-jitter.fig <- Jitter_Est(
-    list(data.matrix(est_prop$Est.prop.weighted),
-         data.matrix(est_prop$Est.prop.allgene)),
-    method.name = methods, title = "Jitter plot of Est Proportions")
+jitter_fig <- Jitter_Est(
+    list(data.matrix(estimated_music_props),
+         data.matrix(estimated_nnls_props)),
+    method.name = methods, title = "Jitter plot of Est Proportions",
+    size = 2, alpha = 0.7) + theme_minimal()
 
 
 ## Make a Plot
 ## A more sophisticated jitter plot is provided as below. We separated
-## the T2D subjects and normal subjects by their HbA1c levels.
-m_prop <- rbind(melt(est_prop$Est.prop.weighted),
-               melt(est_prop$Est.prop.allgene))
+## the T2D subjects and normal subjects by their disease factor levels.
+estimated_music_props_flat <- melt(estimated_music_props)
+estimated_nnls_props_flat <- melt(estimated_nnls_props)
+
+m_prop <- rbind(estimated_music_props_flat,
+                estimated_nnls_props_flat)
+colnames(m_prop) <- c("Sub", "CellType", "Prop")
+
+if (is.null(celltypes)) {
+    celltypes <- levels(m_prop$CellType)
+    message("No celltypes declared, using:")
+    message(celltypes)
+}
 
-colnames(m_prop) <- c("Sub", "CellType", "Prop")
+if (phenotype_target_threshold == -99) {
+    phenotype_target_threshold <- -Inf
+    message("phenotype target threshold set to -Inf")
+}
+
+if (is.null(phenotype_factors)) {
+    phenotype_factors <- colnames(pData(bulk_eset))
+}
+## filter out unwanted factors like "sampleID" and "subjectName"
+phenotype_factors <- phenotype_factors[
+    !(phenotype_factors %in% phenotype_factors_always_exclude)]
+message("Phenotype Factors to use:")
+message(phenotype_factors)
+
 
 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint
-m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint
-m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint
-m_prop <- m_prop[!is.na(m_prop$HbA1c), ]
-m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint
+m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint
+                        levels = methods)
+m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint
+m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]
+## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2
+sample_groups <- c("Normal", sample_disease_group)
+m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint
                          levels = sample_groups)
 
+## Binary to scale: e.g. TRUE / 5 = 0.2
 m_prop$D <- (m_prop$Disease ==   # nolint
              sample_disease_group) / sample_disease_group_scale
-m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype),
-               subset(m_prop, Disease != healthy_phenotype))
+## NA's are not included in the comparison below
+m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),
+               subset(m_prop, Disease == sample_disease_group))
 
-jitter.new <- ggplot(m_prop, aes(Method, Prop)) +
+jitter_new <- ggplot(m_prop, aes(Method, Prop)) +
     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
                size = 2, alpha = 0.7,
                position = position_jitter(width = 0.25, height = 0)) +
@@ -52,20 +83,23 @@
     scale_shape_manual(values = c(21, 24)) + theme_minimal()
 
 ## Plot to compare method effectiveness
-## Create dataframe for beta cell proportions and HbA1c levels
-m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors],
-                        ct.prop = c(est_prop$Est.prop.weighted[, 2],
-                                    est_prop$Est.prop.allgene[, 2]),
-                        Method = factor(rep(methods, each = 89),
+## Create dataframe for beta cell proportions and Disease_factor levels
+m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint
+                                          phenotype_factors],
+                        ct.prop = c(estimated_music_props[, 2],
+                                    estimated_nnls_props[, 2]),
+                        Method = factor(rep(methods,
+                                            each = nrow(estimated_music_props)),
                                         levels = methods))
-colnames(m_prop_ana)[1:4] <- phenotype_factors
-m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_gene]))
+colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint
+m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target]))
 m_prop_ana$Disease <- factor(sample_groups[(  # nolint
-    m_prop_ana[phenotype_gene] > 6.5) + 1], sample_groups)
+    m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1],
+    sample_groups)
 m_prop_ana$D <- (m_prop_ana$Disease ==        # nolint
                  sample_disease_group) / sample_disease_group_scale
 
-jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_gene, "ct.prop")) +
+jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) +
     geom_smooth(method = "lm",  se = FALSE, col = "black", lwd = 0.25) +
     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
                size = 2, alpha = 0.7) +  facet_wrap(~ Method) +
@@ -73,21 +107,81 @@
     scale_colour_manual(values = c("white", "gray20")) +
     scale_shape_manual(values = c(21, 24))
 
+## BoxPlot
+plot_box <- Boxplot_Est(list(
+    data.matrix(estimated_music_props),
+    data.matrix(estimated_nnls_props)),
+    method.name = c("MuSiC", "NNLS")) +
+    theme(axis.text.x = element_text(angle = -90),
+          axis.text.y = element_text(size = 8)) +
+    ggtitle(element_blank()) + theme_minimal()
+
+## Heatmap
+plot_hmap <- Prop_heat_Est(list(
+    data.matrix(estimated_music_props),
+    data.matrix(estimated_nnls_props)),
+    method.name = c("MuSiC", "NNLS")) +
+    theme(axis.text.x = element_text(angle = -90),
+          axis.text.y = element_text(size = 6))
 
 pdf(file = outfile_pdf, width = 8, height = 8)
-plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2)
-jitt_compare
-dev.off()
+plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)
+plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)
+plot_hmap
+message(dev.off())
+
+## Output Proportions
+
+write.table(est_prop$Est.prop.weighted,
+            file = paste0("report_data/prop_",
+                          "Music Estimated Proportions of Cell Types",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Est.prop.allgene,
+            file = paste0("report_data/prop_",
+                          "NNLS Estimated Proportions of Cell Types",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Weight.gene,
+            file = paste0("report_data/weightgene_",
+                          "Music Estimated Proportions of Cell Types (by Gene)",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$r.squared.full,
+            file = paste0("report_data/rsquared_",
+                          "Music R-sqr Estimated Proportions of Each Subject",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Var.prop,
+            file = paste0("report_data/varprop_",
+                          "Matrix of Variance of MuSiC Estimates",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+
 
 ## Summary table
 for (meth in methods) {
     ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =
-    ##subset(m_prop_ana, Method == meth))
+    sub_data <- subset(m_prop_ana, Method == meth)
+    ## We can only do regression where there are more than 1 factors
+    ## so we must find and exclude the ones which are not
+    gt1_facts <- sapply(phenotype_factors, function(facname) {
+        return(length(unique(sort(sub_data[[facname]]))) == 1)
+    })
+    form_factors <- phenotype_factors
+    exclude_facts <- names(gt1_facts)[gt1_facts]
+    if (length(exclude_facts) > 0) {
+        message("Factors with only one level will be excluded:")
+        message(exclude_facts)
+        form_factors <- phenotype_factors[
+            !(phenotype_factors %in% exclude_facts)]
+    }
     lm_beta_meth <- lm(as.formula(
-        paste("ct.prop", paste(phenotype_factors, collapse = " + "),
-              sep = " ~ ")),
-        data = subset(m_prop_ana, Method == meth))
-    print(paste0("Summary: ", meth))
+        paste("ct.prop", paste(form_factors, collapse = " + "),
+              sep = " ~ ")), data = sub_data)
+    message(paste0("Summary: ", meth))
     capture.output(summary(lm_beta_meth),
-                   file = paste0("report_data/summ_", meth, ".txt"))
+                   file = paste0("report_data/summ_Log of ",
+                                 meth,
+                                 " fitting.txt"))
 }
--- a/scripts/inspect.R	Sun Sep 12 19:48:48 2021 +0000
+++ b/scripts/inspect.R	Fri Nov 26 15:54:51 2021 +0000
@@ -6,17 +6,19 @@
 source(args[1])
 
 printout <- function(text) {
-    if (typeof(text) %in% c("list", "vector")) {
+    if (typeof(text) %in% c("list", "vector", "integer", "double", "numeric")) {
         write.table(text, file = outfile_tab, quote = F, sep = "\t",
                     col.names = NA)
     } else {
         ## text
+        print(typeof(text))
         capture.output(text, file = outfile_tab)  # nolint
     }
 }
 
-if (inspector %in% c("print", "pData", "fData", "dims", "experimentData",
-                     "exprs", "signature", "annotation", "abstract")) {
+if (inspector %in% c("print", "pData", "fData", "dims",
+                     "experimentData", "protocolData", "exprs",
+                     "signature", "annotation", "abstract")) {
     op <- get(inspector)
     tab <- op(rds_eset)
     printout(tab)
Binary file test-data/EMTABesethealthy.subset.rds has changed
Binary file test-data/Mousebulkeset.rds has changed
Binary file test-data/Mousesubeset.degenesonly2.half.rds has changed
Binary file test-data/default_output.pdf has changed
Binary file test-data/dendro.pdf has changed
Binary file test-data/dendro_1.pdf has changed