Repository 'music_inspect_eset'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/music_inspect_eset

Changeset 1:817eb707bbf4 (2021-11-26)
Previous changeset 0:2fed32b5aa02 (2021-09-12) Next changeset 2:577435e5e6b2 (2022-01-29)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 683bb72ae92b5759a239b7e3bf4c5a229ed35b54"
modified:
inspect_eset.xml
macros.xml
scripts/dendrogram.R
scripts/estimateprops.R
scripts/inspect.R
test-data/EMTABesethealthy.subset.rds
test-data/Mousebulkeset.rds
test-data/Mousesubeset.degenesonly2.half.rds
test-data/default_output.pdf
test-data/dendro.pdf
added:
test-data/dendro_1.pdf
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 inspect_eset.xml
--- a/inspect_eset.xml Sun Sep 12 19:48:26 2021 +0000
+++ b/inspect_eset.xml Fri Nov 26 15:54:31 2021 +0000
b
@@ -38,14 +38,15 @@
             <option value="pData" >PhenoType Data Table</option>
             <option value="fData" >Feature Data Table</option>
             <option value="dims" >Dimension</option>
-            <option value="experimentData" />
-            <option value="signature" />
-            <option value="annotation" />
-            <option value="abstract" />
+            <option value="protocolData">Protocol Data</option>
+            <option value="experimentData" >Experiment Data</option>
+            <option value="annotation" >Annotation</option>
+            <option value="signature" >Signature (requires Annotation)</option>
+            <option value="abstract" >Abstract</option>
         </param>
     </inputs>
     <outputs>
-        <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: Inspection Result" />
+        <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: Inspection Result (${inspector})" />
     </outputs>
     <tests>
         <test expect_num_outputs="1" >
@@ -75,7 +76,6 @@
             <param name="inspector" value="dims" />
             <output name="out_tab" >
                 <assert_contents>
-                    <has_n_columns n="1" />
                     <has_text_matching expression="Samples\s+\d{1,4}"/>
                 </assert_contents>
             </output>
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 macros.xml
--- a/macros.xml Sun Sep 12 19:48:26 2021 +0000
+++ b/macros.xml Fri Nov 26 15:54:31 2021 +0000
[
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <!-- The ESet inspector/constructor and MuSiC tool can have
          independent Galaxy versions but should reference the same
          package version always. -->
@@ -15,11 +15,11 @@
         <validator type="regex" message="FORMAT terms separated by commas">^(([A-Za-z0-9+_ -]+)\s?,?)*$</validator>
     </xml>
     <xml name="validator_text" >
-        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)*$</validator>
+        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)+$</validator>
     </xml>
     <xml name="celltypes_macro" >
         <param name="celltypes" type="text" optional="true" value=""
-               label="Comma list of cell types to use from scRNA dataset" help="If NULL, then use all cell types." >
+               label="Comma list of cell types to use from scRNA dataset" help="If blank, then use all available cell types." >
             <expand macro="validator_index_identifiers" />
         </param>
     </xml>
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 scripts/dendrogram.R
--- a/scripts/dendrogram.R Sun Sep 12 19:48:26 2021 +0000
+++ b/scripts/dendrogram.R Fri Nov 26 15:54:31 2021 +0000
[
@@ -17,31 +17,6 @@
 args <- commandArgs(trailingOnly = TRUE)
 source(args[1])
 
-## We then perform bulk tissue cell type estimation with pre-grouping
-## of cell types: C, list_of_cell_types, marker genes name, marker
-## genes list.
-## data.to.use = list(
-##     "C1" = list(cell.types = c("Neutro"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C2" = list(cell.types = c("Podo"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
-##                 marker.names = "Epithelial",
-##                 marker.list = read_list("../test-data/epith.markers")),
-##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
-##                 marker.names = "Immune",
-##                 marker.list = read_list("../test-data/immune.markers"))
-## )
-grouped_celltypes <- lapply(data.to.use, function(x) {
-    x$cell.types
-})
-marker_groups <- lapply(data.to.use, function(x) {
-    x$marker.list
-})
-names(marker_groups) <- names(data.to.use)
-
 
 ## Perform the estimation
 ## Produce the first step information
@@ -51,33 +26,107 @@
 
 ## Plot the dendrogram of design matrix and cross-subject mean of
 ## realtive abundance
-par(mfrow = c(1, 2))
-d <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+## Hierarchical clustering using Complete Linkage
+d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+hc1 <- hclust(d1, method = "complete")
 ## Hierarchical clustering using Complete Linkage
-hc1 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
-d <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
-## Hierarchical clustering using Complete Linkage
-hc2 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-pdf(file = outfile_pdf, width = 8, height = 8)
-plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
+hc2 <- hclust(d2, method = "complete")
+
 
-cl_type <- as.character(scrna_eset[[celltypes_label]])
+if (length(data.to.use) > 0) {
+    ## We then perform bulk tissue cell type estimation with pre-grouping
+    ## of cell types: C, list_of_cell_types, marker genes name, marker
+    ## genes list.
+    ## data.to.use = list(
+    ##     "C1" = list(cell.types = c("Neutro"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C2" = list(cell.types = c("Podo"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
+    ##                 marker.names = "Epithelial",
+    ##                 marker.list = read_list("../test-data/epith.markers")),
+    ##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
+    ##                 marker.names = "Immune",
+    ##                 marker.list = read_list("../test-data/immune.markers"))
+    ## )
+    grouped_celltypes <- lapply(data.to.use, function(x) {
+        x$cell.types
+    })
+    marker_groups <- lapply(data.to.use, function(x) {
+        x$marker.list
+    })
+    names(marker_groups) <- names(data.to.use)
+
+
+    cl_type <- as.character(scrna_eset[[celltypes_label]])
+
+    for (cl in seq_len(length(grouped_celltypes))) {
+        cl_type[cl_type %in%
+                grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
+    }
+    pData(scrna_eset)[[clustertype_label]] <- factor(
+        cl_type, levels = c(names(grouped_celltypes),
+                            "CD-Trans", "Novel1", "Novel2"))
+
+    est_bulk <- music_prop.cluster(
+        bulk.eset = bulk_eset, sc.eset = scrna_eset,
+        group.markers = marker_groups, clusters = celltypes_label,
+        groups = clustertype_label, samples = samples_label,
+        clusters.type = grouped_celltypes
+    )
 
-for (cl in seq_len(length(grouped_celltypes))) {
-  cl_type[cl_type %in% grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
-}
-pData(scrna_eset)[[clustertype_label]] <- factor(
-    cl_type, levels = c(names(grouped_celltypes),
-                        "CD-Trans", "Novel1", "Novel2"))
+    estimated_music_props <- est_bulk$Est.prop.weighted.cluster
+    ## NNLS is not calculated here
+
+    ## Show different in estimation methods
+    ## Jitter plot of estimated cell type proportions
+    methods_list <- c("MuSiC")
+
+    jitter_fig <- Jitter_Est(
+        list(data.matrix(estimated_music_props)),
+        method.name = methods_list, title = "Jitter plot of Est Proportions",
+        size = 2, alpha = 0.7) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
+
+    plot_box <- Boxplot_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
 
-est_bulk <- music_prop.cluster(
-    bulk.eset = bulk_eset, sc.eset = scrna_eset,
-    group.markers = marker_groups, clusters = celltypes_label,
-    groups = clustertype_label, samples = samples_label,
-    clusters.type = grouped_celltypes)
+    plot_hmap <- Prop_heat_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text.y = element_text(size = 6),
+              axis.text.x = element_text(angle = -90, size = 5),
+              plot.title = element_text(size = 9),
+              legend.key.width = unit(0.15, "cm"),
+              legend.text = element_text(size = 5),
+              legend.title = element_text(size = 5))
 
-write.table(est_bulk, file = outfile_tab, quote = F, col.names = NA, sep = "\t")
-dev.off()
+}
+    
+pdf(file = outfile_pdf, width = 8, height = 8)
+par(mfrow = c(1, 2))
+plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
+plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+if (length(data.to.use) > 0) {
+    plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2)
+}
+message(dev.off())
+
+if (length(data.to.use) > 0) {
+    write.table(estimated_music_props,
+                file = outfile_tab, quote = F, col.names = NA, sep = "\t")
+}
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 scripts/estimateprops.R
--- a/scripts/estimateprops.R Sun Sep 12 19:48:26 2021 +0000
+++ b/scripts/estimateprops.R Fri Nov 26 15:54:31 2021 +0000
[
b'@@ -14,36 +14,67 @@\n     clusters = celltypes_label,\n     samples = samples_label, select.ct = celltypes, verbose = T)\n \n+estimated_music_props <- est_prop$Est.prop.weighted\n+estimated_nnls_props <- est_prop$Est.prop.allgene\n \n ## Show different in estimation methods\n ## Jitter plot of estimated cell type proportions\n-jitter.fig <- Jitter_Est(\n-    list(data.matrix(est_prop$Est.prop.weighted),\n-         data.matrix(est_prop$Est.prop.allgene)),\n-    method.name = methods, title = "Jitter plot of Est Proportions")\n+jitter_fig <- Jitter_Est(\n+    list(data.matrix(estimated_music_props),\n+         data.matrix(estimated_nnls_props)),\n+    method.name = methods, title = "Jitter plot of Est Proportions",\n+    size = 2, alpha = 0.7) + theme_minimal()\n \n \n ## Make a Plot\n ## A more sophisticated jitter plot is provided as below. We separated\n-## the T2D subjects and normal subjects by their HbA1c levels.\n-m_prop <- rbind(melt(est_prop$Est.prop.weighted),\n-               melt(est_prop$Est.prop.allgene))\n+## the T2D subjects and normal subjects by their disease factor levels.\n+estimated_music_props_flat <- melt(estimated_music_props)\n+estimated_nnls_props_flat <- melt(estimated_nnls_props)\n+\n+m_prop <- rbind(estimated_music_props_flat,\n+                estimated_nnls_props_flat)\n+colnames(m_prop) <- c("Sub", "CellType", "Prop")\n+\n+if (is.null(celltypes)) {\n+    celltypes <- levels(m_prop$CellType)\n+    message("No celltypes declared, using:")\n+    message(celltypes)\n+}\n \n-colnames(m_prop) <- c("Sub", "CellType", "Prop")\n+if (phenotype_target_threshold == -99) {\n+    phenotype_target_threshold <- -Inf\n+    message("phenotype target threshold set to -Inf")\n+}\n+\n+if (is.null(phenotype_factors)) {\n+    phenotype_factors <- colnames(pData(bulk_eset))\n+}\n+## filter out unwanted factors like "sampleID" and "subjectName"\n+phenotype_factors <- phenotype_factors[\n+    !(phenotype_factors %in% phenotype_factors_always_exclude)]\n+message("Phenotype Factors to use:")\n+message(phenotype_factors)\n+\n \n m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint\n-m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint\n-m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint\n-m_prop <- m_prop[!is.na(m_prop$HbA1c), ]\n-m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint\n+m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint\n+                        levels = methods)\n+m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint\n+m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]\n+## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2\n+sample_groups <- c("Normal", sample_disease_group)\n+m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint\n                          levels = sample_groups)\n \n+## Binary to scale: e.g. TRUE / 5 = 0.2\n m_prop$D <- (m_prop$Disease ==   # nolint\n              sample_disease_group) / sample_disease_group_scale\n-m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype),\n-               subset(m_prop, Disease != healthy_phenotype))\n+## NA\'s are not included in the comparison below\n+m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),\n+               subset(m_prop, Disease == sample_disease_group))\n \n-jitter.new <- ggplot(m_prop, aes(Method, Prop)) +\n+jitter_new <- ggplot(m_prop, aes(Method, Prop)) +\n     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n                size = 2, alpha = 0.7,\n                position = position_jitter(width = 0.25, height = 0)) +\n@@ -52,20 +83,23 @@\n     scale_shape_manual(values = c(21, 24)) + theme_minimal()\n \n ## Plot to compare method effectiveness\n-## Create dataframe for beta cell proportions and HbA1c levels\n-m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors],\n-                        ct.prop = c(est_prop$Est.prop.weighted[, 2],\n-                      '..b'mooth(method = "lm",  se = FALSE, col = "black", lwd = 0.25) +\n     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),\n                size = 2, alpha = 0.7) +  facet_wrap(~ Method) +\n@@ -73,21 +107,81 @@\n     scale_colour_manual(values = c("white", "gray20")) +\n     scale_shape_manual(values = c(21, 24))\n \n+## BoxPlot\n+plot_box <- Boxplot_Est(list(\n+    data.matrix(estimated_music_props),\n+    data.matrix(estimated_nnls_props)),\n+    method.name = c("MuSiC", "NNLS")) +\n+    theme(axis.text.x = element_text(angle = -90),\n+          axis.text.y = element_text(size = 8)) +\n+    ggtitle(element_blank()) + theme_minimal()\n+\n+## Heatmap\n+plot_hmap <- Prop_heat_Est(list(\n+    data.matrix(estimated_music_props),\n+    data.matrix(estimated_nnls_props)),\n+    method.name = c("MuSiC", "NNLS")) +\n+    theme(axis.text.x = element_text(angle = -90),\n+          axis.text.y = element_text(size = 6))\n \n pdf(file = outfile_pdf, width = 8, height = 8)\n-plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2)\n-jitt_compare\n-dev.off()\n+plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)\n+plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)\n+plot_hmap\n+message(dev.off())\n+\n+## Output Proportions\n+\n+write.table(est_prop$Est.prop.weighted,\n+            file = paste0("report_data/prop_",\n+                          "Music Estimated Proportions of Cell Types",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Est.prop.allgene,\n+            file = paste0("report_data/prop_",\n+                          "NNLS Estimated Proportions of Cell Types",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Weight.gene,\n+            file = paste0("report_data/weightgene_",\n+                          "Music Estimated Proportions of Cell Types (by Gene)",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$r.squared.full,\n+            file = paste0("report_data/rsquared_",\n+                          "Music R-sqr Estimated Proportions of Each Subject",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Var.prop,\n+            file = paste0("report_data/varprop_",\n+                          "Matrix of Variance of MuSiC Estimates",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+\n \n ## Summary table\n for (meth in methods) {\n     ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =\n-    ##subset(m_prop_ana, Method == meth))\n+    sub_data <- subset(m_prop_ana, Method == meth)\n+    ## We can only do regression where there are more than 1 factors\n+    ## so we must find and exclude the ones which are not\n+    gt1_facts <- sapply(phenotype_factors, function(facname) {\n+        return(length(unique(sort(sub_data[[facname]]))) == 1)\n+    })\n+    form_factors <- phenotype_factors\n+    exclude_facts <- names(gt1_facts)[gt1_facts]\n+    if (length(exclude_facts) > 0) {\n+        message("Factors with only one level will be excluded:")\n+        message(exclude_facts)\n+        form_factors <- phenotype_factors[\n+            !(phenotype_factors %in% exclude_facts)]\n+    }\n     lm_beta_meth <- lm(as.formula(\n-        paste("ct.prop", paste(phenotype_factors, collapse = " + "),\n-              sep = " ~ ")),\n-        data = subset(m_prop_ana, Method == meth))\n-    print(paste0("Summary: ", meth))\n+        paste("ct.prop", paste(form_factors, collapse = " + "),\n+              sep = " ~ ")), data = sub_data)\n+    message(paste0("Summary: ", meth))\n     capture.output(summary(lm_beta_meth),\n-                   file = paste0("report_data/summ_", meth, ".txt"))\n+                   file = paste0("report_data/summ_Log of ",\n+                                 meth,\n+                                 " fitting.txt"))\n }\n'
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 scripts/inspect.R
--- a/scripts/inspect.R Sun Sep 12 19:48:26 2021 +0000
+++ b/scripts/inspect.R Fri Nov 26 15:54:31 2021 +0000
[
@@ -6,17 +6,19 @@
 source(args[1])
 
 printout <- function(text) {
-    if (typeof(text) %in% c("list", "vector")) {
+    if (typeof(text) %in% c("list", "vector", "integer", "double", "numeric")) {
         write.table(text, file = outfile_tab, quote = F, sep = "\t",
                     col.names = NA)
     } else {
         ## text
+        print(typeof(text))
         capture.output(text, file = outfile_tab)  # nolint
     }
 }
 
-if (inspector %in% c("print", "pData", "fData", "dims", "experimentData",
-                     "exprs", "signature", "annotation", "abstract")) {
+if (inspector %in% c("print", "pData", "fData", "dims",
+                     "experimentData", "protocolData", "exprs",
+                     "signature", "annotation", "abstract")) {
     op <- get(inspector)
     tab <- op(rds_eset)
     printout(tab)
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/EMTABesethealthy.subset.rds
b
Binary file test-data/EMTABesethealthy.subset.rds has changed
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/Mousebulkeset.rds
b
Binary file test-data/Mousebulkeset.rds has changed
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/Mousesubeset.degenesonly2.half.rds
b
Binary file test-data/Mousesubeset.degenesonly2.half.rds has changed
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/default_output.pdf
b
Binary file test-data/default_output.pdf has changed
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/dendro.pdf
b
Binary file test-data/dendro.pdf has changed
b
diff -r 2fed32b5aa02 -r 817eb707bbf4 test-data/dendro_1.pdf
b
Binary file test-data/dendro_1.pdf has changed