Mercurial > repos > bgruening > music_construct_eset

--- a/construct_eset.xml	Sun Sep 12 19:49:12 2021 +0000
+++ b/construct_eset.xml	Fri Nov 26 15:55:11 2021 +0000
@@ -14,9 +14,12 @@
 suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
 suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))

-null_str_vec = function(gstr){
+null_str_vec = function(gstr, is.str=FALSE){
    tokens = unlist(as.vector(strsplit(gstr, split=",")))
    if (length(tokens) == 0){
+      if (is.str){
+         return(character(0))
+      }
       return(NULL)
    }
    if (length(tokens) == 1){
@@ -40,6 +43,9 @@
 #end if
 ## Annotation and Feature Data, or just a string for type of chip used
 annotation = null_str_vec('$annotation')
+if (is.null(annotation)){
+   annotation = character(0)
+}

 if (all(rownames(pdata) != colnames(exprs))) {
    stop("Number of Samples between phenotypes and assays are not the same")
@@ -66,26 +72,27 @@
 metadata\$lname = NULL

 if (nrow(metadata)==0) {
-   metadata = NULL
+    pheno_data = new("AnnotatedDataFrame", data = pdata)
+} else {
+    pheno_data = new("AnnotatedDataFrame", data = pdata, varMetadata = metadata)
 }

-pheno_data = new("AnnotatedDataFrame", data = pdata, varMetadata = metadata)

 ## Experiment Description -- using the MIAME object
 experiment_data = new(
     "MIAME",
-    name = null_str_vec('$expdata.name'),
-    lab = null_str_vec('$expdata.lab'),
-    contact = null_str_vec('$expdata.contact'),
-    title = null_str_vec('$expdata.title'),
-    abstract = null_str_vec('$expdata.abstract'),
-    url = null_str_vec('$expdata.url'),
+    name = null_str_vec('$expdata.name', is.str=T),
+    lab = null_str_vec('$expdata.lab', is.str=T),
+    contact = null_str_vec('$expdata.contact', is.str=T),
+    title = null_str_vec('$expdata.title', is.str=T),
+    abstract = null_str_vec('$expdata.abstract', is.str=T),
+    url = null_str_vec('$expdata.url', is.str=T),
     other = list(
 #for i, row in enumerate($expdata.other):
     #if i==0
-    '$row.field' = null_str_vec('$row.comment')
+    '$row.field' = null_str_vec('$row.comment', is.str=T)
     #else
-    ,'$row.field' = null_str_vec('$row.comment')
+    ,'$row.field' = null_str_vec('$row.comment', is.str=T)
     #end if
 #end for
 ))
@@ -95,7 +102,7 @@
                       experimentData = experiment_data,
                       annotation = annotation)

-capture.output(print(e_set), file = '$out_tab')
+capture.output(print(e_set), file = '$out_txt')
 saveRDS(e_set, file= '$out_rds')

         </configfile>
@@ -112,7 +119,8 @@
         </param>
         <repeat name="metadata" title="Meta Data" min="0" max="15" >
             <!-- optional, so min=0 -->
-            <param name="row_names" label="Label" type="text" >
+            <param name="row_names" label="Label" type="text"
+                   help="Metadata should correspond directly to the columns of the Phenotype Data" >
                 <expand macro="validator_text_and_urls" />
             </param>
             <param name="label_desc" label="Label Description" type="text" >
@@ -149,7 +157,7 @@
         </section>
     </inputs>
     <outputs>
-        <data name="out_tab" format="tabular" label="${tool.name} on ${on_string}: General Info" />
+        <data name="out_txt" format="txt" label="${tool.name} on ${on_string}: General Info" />
         <data name="out_rds" format="rdata.eset" label="${tool.name} on ${on_string}: RData ESet Object" />
     </outputs>
     <tests>
@@ -186,7 +194,7 @@
                     <param name="comment" value="Some other comment" />
                 </repeat>
             </section>
-            <output name="out_tab">
+            <output name="out_txt">
                 <assert_contents>
                     <has_text text="assayData: 3 features, 2 samples " />
                 </assert_contents>
--- a/macros.xml	Sun Sep 12 19:49:12 2021 +0000
+++ b/macros.xml	Fri Nov 26 15:55:11 2021 +0000
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <!-- The ESet inspector/constructor and MuSiC tool can have
          independent Galaxy versions but should reference the same
          package version always. -->
@@ -15,11 +15,11 @@
         <validator type="regex" message="FORMAT terms separated by commas">^(([A-Za-z0-9+_ -]+)\s?,?)*$</validator>
     </xml>
     <xml name="validator_text" >
-        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)*$</validator>
+        <validator type="regex" message="No commas allowed">^(([A-Za-z0-9+_ -]+)\s?)+$</validator>
     </xml>
     <xml name="celltypes_macro" >
         <param name="celltypes" type="text" optional="true" value=""
-               label="Comma list of cell types to use from scRNA dataset" help="If NULL, then use all cell types." >
+               label="Comma list of cell types to use from scRNA dataset" help="If blank, then use all available cell types." >
             <expand macro="validator_index_identifiers" />
         </param>
     </xml>
--- a/scripts/dendrogram.R	Sun Sep 12 19:49:12 2021 +0000
+++ b/scripts/dendrogram.R	Fri Nov 26 15:55:11 2021 +0000
@@ -17,31 +17,6 @@
 args <- commandArgs(trailingOnly = TRUE)
 source(args[1])

-## We then perform bulk tissue cell type estimation with pre-grouping
-## of cell types: C, list_of_cell_types, marker genes name, marker
-## genes list.
-## data.to.use = list(
-##     "C1" = list(cell.types = c("Neutro"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C2" = list(cell.types = c("Podo"),
-##                 marker.names=NULL,
-##                 marker.list=NULL),
-##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
-##                 marker.names = "Epithelial",
-##                 marker.list = read_list("../test-data/epith.markers")),
-##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
-##                 marker.names = "Immune",
-##                 marker.list = read_list("../test-data/immune.markers"))
-## )
-grouped_celltypes <- lapply(data.to.use, function(x) {
-    x$cell.types
-})
-marker_groups <- lapply(data.to.use, function(x) {
-    x$marker.list
-})
-names(marker_groups) <- names(data.to.use)
-

 ## Perform the estimation
 ## Produce the first step information
@@ -51,33 +26,107 @@

 ## Plot the dendrogram of design matrix and cross-subject mean of
 ## realtive abundance
-par(mfrow = c(1, 2))
-d <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+## Hierarchical clustering using Complete Linkage
+d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+hc1 <- hclust(d1, method = "complete")
 ## Hierarchical clustering using Complete Linkage
-hc1 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
-d <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
-## Hierarchical clustering using Complete Linkage
-hc2 <- hclust(d, method = "complete")
-## Plot the obtained dendrogram
-pdf(file = outfile_pdf, width = 8, height = 8)
-plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
+hc2 <- hclust(d2, method = "complete")
+

-cl_type <- as.character(scrna_eset[[celltypes_label]])
+if (length(data.to.use) > 0) {
+    ## We then perform bulk tissue cell type estimation with pre-grouping
+    ## of cell types: C, list_of_cell_types, marker genes name, marker
+    ## genes list.
+    ## data.to.use = list(
+    ##     "C1" = list(cell.types = c("Neutro"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C2" = list(cell.types = c("Podo"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
+    ##                 marker.names = "Epithelial",
+    ##                 marker.list = read_list("../test-data/epith.markers")),
+    ##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
+    ##                 marker.names = "Immune",
+    ##                 marker.list = read_list("../test-data/immune.markers"))
+    ## )
+    grouped_celltypes <- lapply(data.to.use, function(x) {
+        x$cell.types
+    })
+    marker_groups <- lapply(data.to.use, function(x) {
+        x$marker.list
+    })
+    names(marker_groups) <- names(data.to.use)
+
+
+    cl_type <- as.character(scrna_eset[[celltypes_label]])
+
+    for (cl in seq_len(length(grouped_celltypes))) {
+        cl_type[cl_type %in%
+                grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
+    }
+    pData(scrna_eset)[[clustertype_label]] <- factor(
+        cl_type, levels = c(names(grouped_celltypes),
+                            "CD-Trans", "Novel1", "Novel2"))
+
+    est_bulk <- music_prop.cluster(
+        bulk.eset = bulk_eset, sc.eset = scrna_eset,
+        group.markers = marker_groups, clusters = celltypes_label,
+        groups = clustertype_label, samples = samples_label,
+        clusters.type = grouped_celltypes
+    )

-for (cl in seq_len(length(grouped_celltypes))) {
-  cl_type[cl_type %in% grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
-}
-pData(scrna_eset)[[clustertype_label]] <- factor(
-    cl_type, levels = c(names(grouped_celltypes),
-                        "CD-Trans", "Novel1", "Novel2"))
+    estimated_music_props <- est_bulk$Est.prop.weighted.cluster
+    ## NNLS is not calculated here
+
+    ## Show different in estimation methods
+    ## Jitter plot of estimated cell type proportions
+    methods_list <- c("MuSiC")
+
+    jitter_fig <- Jitter_Est(
+        list(data.matrix(estimated_music_props)),
+        method.name = methods_list, title = "Jitter plot of Est Proportions",
+        size = 2, alpha = 0.7) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
+
+    plot_box <- Boxplot_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")

-est_bulk <- music_prop.cluster(
-    bulk.eset = bulk_eset, sc.eset = scrna_eset,
-    group.markers = marker_groups, clusters = celltypes_label,
-    groups = clustertype_label, samples = samples_label,
-    clusters.type = grouped_celltypes)
+    plot_hmap <- Prop_heat_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text.y = element_text(size = 6),
+              axis.text.x = element_text(angle = -90, size = 5),
+              plot.title = element_text(size = 9),
+              legend.key.width = unit(0.15, "cm"),
+              legend.text = element_text(size = 5),
+              legend.title = element_text(size = 5))

-write.table(est_bulk, file = outfile_tab, quote = F, col.names = NA, sep = "\t")
-dev.off()
+}
+
+pdf(file = outfile_pdf, width = 8, height = 8)
+par(mfrow = c(1, 2))
+plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
+plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+if (length(data.to.use) > 0) {
+    plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2)
+}
+message(dev.off())
+
+if (length(data.to.use) > 0) {
+    write.table(estimated_music_props,
+                file = outfile_tab, quote = F, col.names = NA, sep = "\t")
+}
--- a/scripts/estimateprops.R	Sun Sep 12 19:49:12 2021 +0000
+++ b/scripts/estimateprops.R	Fri Nov 26 15:55:11 2021 +0000
@@ -14,36 +14,67 @@
     clusters = celltypes_label,
     samples = samples_label, select.ct = celltypes, verbose = T)

+estimated_music_props <- est_prop$Est.prop.weighted
+estimated_nnls_props <- est_prop$Est.prop.allgene

 ## Show different in estimation methods
 ## Jitter plot of estimated cell type proportions
-jitter.fig <- Jitter_Est(
-    list(data.matrix(est_prop$Est.prop.weighted),
-         data.matrix(est_prop$Est.prop.allgene)),
-    method.name = methods, title = "Jitter plot of Est Proportions")
+jitter_fig <- Jitter_Est(
+    list(data.matrix(estimated_music_props),
+         data.matrix(estimated_nnls_props)),
+    method.name = methods, title = "Jitter plot of Est Proportions",
+    size = 2, alpha = 0.7) + theme_minimal()


 ## Make a Plot
 ## A more sophisticated jitter plot is provided as below. We separated
-## the T2D subjects and normal subjects by their HbA1c levels.
-m_prop <- rbind(melt(est_prop$Est.prop.weighted),
-               melt(est_prop$Est.prop.allgene))
+## the T2D subjects and normal subjects by their disease factor levels.
+estimated_music_props_flat <- melt(estimated_music_props)
+estimated_nnls_props_flat <- melt(estimated_nnls_props)
+
+m_prop <- rbind(estimated_music_props_flat,
+                estimated_nnls_props_flat)
+colnames(m_prop) <- c("Sub", "CellType", "Prop")
+
+if (is.null(celltypes)) {
+    celltypes <- levels(m_prop$CellType)
+    message("No celltypes declared, using:")
+    message(celltypes)
+}

-colnames(m_prop) <- c("Sub", "CellType", "Prop")
+if (phenotype_target_threshold == -99) {
+    phenotype_target_threshold <- -Inf
+    message("phenotype target threshold set to -Inf")
+}
+
+if (is.null(phenotype_factors)) {
+    phenotype_factors <- colnames(pData(bulk_eset))
+}
+## filter out unwanted factors like "sampleID" and "subjectName"
+phenotype_factors <- phenotype_factors[
+    !(phenotype_factors %in% phenotype_factors_always_exclude)]
+message("Phenotype Factors to use:")
+message(phenotype_factors)
+

 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint
-m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint
-m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint
-m_prop <- m_prop[!is.na(m_prop$HbA1c), ]
-m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint
+m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint
+                        levels = methods)
+m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint
+m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]
+## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2
+sample_groups <- c("Normal", sample_disease_group)
+m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint
                          levels = sample_groups)

+## Binary to scale: e.g. TRUE / 5 = 0.2
 m_prop$D <- (m_prop$Disease ==   # nolint
              sample_disease_group) / sample_disease_group_scale
-m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype),
-               subset(m_prop, Disease != healthy_phenotype))
+## NA's are not included in the comparison below
+m_prop <- rbind(subset(m_prop, Disease != sample_disease_group),
+               subset(m_prop, Disease == sample_disease_group))

-jitter.new <- ggplot(m_prop, aes(Method, Prop)) +
+jitter_new <- ggplot(m_prop, aes(Method, Prop)) +
     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
                size = 2, alpha = 0.7,
                position = position_jitter(width = 0.25, height = 0)) +
@@ -52,20 +83,23 @@
     scale_shape_manual(values = c(21, 24)) + theme_minimal()

 ## Plot to compare method effectiveness
-## Create dataframe for beta cell proportions and HbA1c levels
-m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors],
-                        ct.prop = c(est_prop$Est.prop.weighted[, 2],
-                                    est_prop$Est.prop.allgene[, 2]),
-                        Method = factor(rep(methods, each = 89),
+## Create dataframe for beta cell proportions and Disease_factor levels
+m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint
+                                          phenotype_factors],
+                        ct.prop = c(estimated_music_props[, 2],
+                                    estimated_nnls_props[, 2]),
+                        Method = factor(rep(methods,
+                                            each = nrow(estimated_music_props)),
                                         levels = methods))
-colnames(m_prop_ana)[1:4] <- phenotype_factors
-m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_gene]))
+colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint
+m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target]))
 m_prop_ana$Disease <- factor(sample_groups[(  # nolint
-    m_prop_ana[phenotype_gene] > 6.5) + 1], sample_groups)
+    m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1],
+    sample_groups)
 m_prop_ana$D <- (m_prop_ana$Disease ==        # nolint
                  sample_disease_group) / sample_disease_group_scale

-jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_gene, "ct.prop")) +
+jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) +
     geom_smooth(method = "lm",  se = FALSE, col = "black", lwd = 0.25) +
     geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease),
                size = 2, alpha = 0.7) +  facet_wrap(~ Method) +
@@ -73,21 +107,81 @@
     scale_colour_manual(values = c("white", "gray20")) +
     scale_shape_manual(values = c(21, 24))

+## BoxPlot
+plot_box <- Boxplot_Est(list(
+    data.matrix(estimated_music_props),
+    data.matrix(estimated_nnls_props)),
+    method.name = c("MuSiC", "NNLS")) +
+    theme(axis.text.x = element_text(angle = -90),
+          axis.text.y = element_text(size = 8)) +
+    ggtitle(element_blank()) + theme_minimal()
+
+## Heatmap
+plot_hmap <- Prop_heat_Est(list(
+    data.matrix(estimated_music_props),
+    data.matrix(estimated_nnls_props)),
+    method.name = c("MuSiC", "NNLS")) +
+    theme(axis.text.x = element_text(angle = -90),
+          axis.text.y = element_text(size = 6))

 pdf(file = outfile_pdf, width = 8, height = 8)
-plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2)
-jitt_compare
-dev.off()
+plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)
+plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)
+plot_hmap
+message(dev.off())
+
+## Output Proportions
+
+write.table(est_prop$Est.prop.weighted,
+            file = paste0("report_data/prop_",
+                          "Music Estimated Proportions of Cell Types",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Est.prop.allgene,
+            file = paste0("report_data/prop_",
+                          "NNLS Estimated Proportions of Cell Types",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Weight.gene,
+            file = paste0("report_data/weightgene_",
+                          "Music Estimated Proportions of Cell Types (by Gene)",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$r.squared.full,
+            file = paste0("report_data/rsquared_",
+                          "Music R-sqr Estimated Proportions of Each Subject",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+write.table(est_prop$Var.prop,
+            file = paste0("report_data/varprop_",
+                          "Matrix of Variance of MuSiC Estimates",
+                          ".tabular"),
+            quote = F, sep = "\t", col.names = NA)
+

 ## Summary table
 for (meth in methods) {
     ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =
-    ##subset(m_prop_ana, Method == meth))
+    sub_data <- subset(m_prop_ana, Method == meth)
+    ## We can only do regression where there are more than 1 factors
+    ## so we must find and exclude the ones which are not
+    gt1_facts <- sapply(phenotype_factors, function(facname) {
+        return(length(unique(sort(sub_data[[facname]]))) == 1)
+    })
+    form_factors <- phenotype_factors
+    exclude_facts <- names(gt1_facts)[gt1_facts]
+    if (length(exclude_facts) > 0) {
+        message("Factors with only one level will be excluded:")
+        message(exclude_facts)
+        form_factors <- phenotype_factors[
+            !(phenotype_factors %in% exclude_facts)]
+    }
     lm_beta_meth <- lm(as.formula(
-        paste("ct.prop", paste(phenotype_factors, collapse = " + "),
-              sep = " ~ ")),
-        data = subset(m_prop_ana, Method == meth))
-    print(paste0("Summary: ", meth))
+        paste("ct.prop", paste(form_factors, collapse = " + "),
+              sep = " ~ ")), data = sub_data)
+    message(paste0("Summary: ", meth))
     capture.output(summary(lm_beta_meth),
-                   file = paste0("report_data/summ_", meth, ".txt"))
+                   file = paste0("report_data/summ_Log of ",
+                                 meth,
+                                 " fitting.txt"))
 }
--- a/scripts/inspect.R	Sun Sep 12 19:49:12 2021 +0000
+++ b/scripts/inspect.R	Fri Nov 26 15:55:11 2021 +0000
@@ -6,17 +6,19 @@
 source(args[1])

 printout <- function(text) {
-    if (typeof(text) %in% c("list", "vector")) {
+    if (typeof(text) %in% c("list", "vector", "integer", "double", "numeric")) {
         write.table(text, file = outfile_tab, quote = F, sep = "\t",
                     col.names = NA)
     } else {
         ## text
+        print(typeof(text))
         capture.output(text, file = outfile_tab)  # nolint
     }
 }

-if (inspector %in% c("print", "pData", "fData", "dims", "experimentData",
-                     "exprs", "signature", "annotation", "abstract")) {
+if (inspector %in% c("print", "pData", "fData", "dims",
+                     "experimentData", "protocolData", "exprs",
+                     "signature", "annotation", "abstract")) {
     op <- get(inspector)
     tab <- op(rds_eset)
     printout(tab)
Binary file test-data/EMTABesethealthy.subset.rds has changed
Binary file test-data/Mousebulkeset.rds has changed
Binary file test-data/Mousesubeset.degenesonly2.half.rds has changed
Binary file test-data/default_output.pdf has changed
Binary file test-data/dendro.pdf has changed
Binary file test-data/dendro_1.pdf has changed