Repository 'music_deconvolution'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/music_deconvolution

Changeset 6:fb36f390cc52 (2024-10-28)
Previous changeset 5:2ba99a52bd44 (2022-05-02) Next changeset 7:ffcdd3a11ba9 (2024-10-28)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d5c7ca22af1d4f0eaa7a607886554bebb95e8c50
added:
music-deconvolution.xml.orig
scripts/dendrogram.R.orig
scripts/estimateprops.R.orig
b
diff -r 2ba99a52bd44 -r fb36f390cc52 music-deconvolution.xml.orig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/music-deconvolution.xml.orig Mon Oct 28 17:32:19 2024 +0000
[
b'@@ -0,0 +1,357 @@\n+<tool id="music_deconvolution" name="MuSiC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"\n+      profile="21.09" license="GPL-3.0-or-later" >\n+    <description>estimate cell type proportions in bulk RNA-seq data</description>\n+    <macros>\n+        <import>macros.xml</import>\n+    </macros>\n+    <expand macro="requirements" />\n+    <command detect_errors="exit_code" ><![CDATA[\n+mkdir report_data &&\n+Rscript --vanilla \'$__tool_directory__/scripts/${do.method}.R\' \'$conf\'\n+]]></command>\n+    <configfiles>\n+        <configfile name="conf" >\n+\n+null_str_vec = function(gstr){\n+   tokens = unlist(as.vector(strsplit(gstr, split=",")))\n+   if (length(tokens) == 0){\n+      return(NULL)\n+   }\n+   if (length(tokens) == 1){\n+      return(tokens[[1]])\n+   }\n+   return(tokens)\n+}\n+\n+bulk_eset = readRDS(\'$bulk_eset\')\n+scrna_eset = readRDS(\'$scrna_eset\')\n+use_disease_factor = FALSE\n+maxyscale = NA\n+\n+#if str($do.method) == "estimateprops":\n+\n+maxyscale = as.numeric(\'$do.maxyscale\')  ## yields "NA" if blank\n+phenotype_factors = null_str_vec(\'$do.phenotype_factors\')\n+phenotype_factors_always_exclude = null_str_vec(\'$do.phenotype_factors_always_exclude\')\n+celltypes_label = null_str_vec(\'$do.celltypes_label\')\n+samples_label = null_str_vec(\'$do.samples_label\')\n+celltypes = null_str_vec(\'$do.celltypes\')\n+methods = c("MuSiC", "NNLS")\n+\n+    #if str($do.disease_factor.use) == "yes":\n+use_disease_factor = TRUE\n+<<<<<<< HEAD\n+phenotype_scrna_target = null_str_vec(\'$do.disease_factor.phenotype_scrna_target\')\n+=======\n+>>>>>>> 768a6e5b (v3 update:)\n+phenotype_target = null_str_vec(\'$do.disease_factor.phenotype_target\')\n+phenotype_target_threshold = as.numeric(\'$do.disease_factor.phenotype_target_threshold\')\n+sample_disease_group = null_str_vec(\'$do.disease_factor.sample_disease_group\')\n+sample_disease_group_scale = as.integer(\'$do.disease_factor.sample_disease_group_scale\')\n+<<<<<<< HEAD\n+=======\n+compare_title = null_str_vec(\'$do.disease_factor.compare_title\')\n+>>>>>>> 768a6e5b (v3 update:)\n+    #end if\n+\n+outfile_pdf=\'$out_pdf\'\n+\n+#elif str($do.method) == "dendrogram":\n+\n+celltypes_label = null_str_vec(\'$do.celltypes_label\')\n+clustertype_label = null_str_vec(\'$do.clustertype_label\')\n+samples_label = null_str_vec(\'$do.samples_label\')\n+celltypes = null_str_vec(\'$do.celltypes\')\n+\n+data.to.use = list(\n+    #for $i, $repeat in enumerate( $do.cluster_groups )\n+        #if $i == 0:\n+        $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n+            marker.names = null_str_vec(\'$repeat.marker_name\'),\n+            marker.list = read_list(\'$repeat.marker_list\'))\n+        #else\n+        , $repeat.cluster_id = list(cell.types = null_str_vec(\'$repeat.celltypes\'),\n+            marker.names = null_str_vec(\'$repeat.marker_name\'),\n+            marker.list = read_list(\'$repeat.marker_list\'))\n+        #end if\n+    #end for\n+)\n+\n+outfile_pdf=\'$out_pdf\'\n+outfile_tab=\'$out_tab\'\n+\n+#else\n+    stop("No such option")\n+#end if\n+\n+        </configfile>\n+    </configfiles>\n+    <inputs>\n+        <param name="scrna_eset" label="scRNA Dataset" type="data" format="@RDATATYPE@" />\n+        <param name="bulk_eset" label="Bulk RNA Dataset" type="data" format="@RDATATYPE@" />\n+        <conditional name="do" >\n+            <param name="method" type="select" label="Purpose" >\n+                <!-- The values here correspond to script names in the scripts folder\n+                     and must remain so -->\n+                <option value="estimateprops">Estimate Proportions</option>\n+                <option value="dendrogram">Compute Dendrogram</option>\n+            </param>\n+            <when value="estimateprops" >\n+                <param name="celltypes_label" type="text" value="cellType"\n+                       label="Cell Types Label from scRNA dataset" >\n+                    <expand macro="validator_text" />\n+                </param>\n+                <param name="samples_label" type="text" value="sampleID"\n+                       label='..b'cellType" />\n+                <param name="samples_label" value="sampleID" />\n+                <param name="disease_factor" value="no" />\n+            </conditional>\n+            <output name="out_pdf" value="default_output_no_disease.pdf" compare="sim_size" />\n+            <output_collection name="summaries" count="5">\n+                <element name="Log of MuSiC fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.1734 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+                <element name="Log of NNLS fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.2687 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+            </output_collection>\n+        </test>\n+        <test expect_num_outputs="3" >\n+            <!-- Estimate Proportions test -->\n+            <param name="bulk_eset" value="GSE50244bulkeset.subset.rds" />\n+            <param name="scrna_eset" value="EMTABesethealthy.subset.rds" />\n+            <conditional name="do" >\n+                <param name="method" value="estimateprops" />\n+                <param name="celltypes_label" value="cellType" />\n+                <param name="samples_label" value="sampleID" />\n+                <param name="celltypes" value="alpha,beta,delta,gamma,acinar,ductal" />\n+                <conditional name="disease_factor" >\n+                    <param name="use" value="yes" />\n+<<<<<<< HEAD\n+                    <param name="phenotype_scrna_target" value="beta" />\n+=======\n+>>>>>>> 768a6e5b (v3 update:)\n+                    <param name="phenotype_factors" value="age,bmi,hba1c,gender" />\n+                    <param name="phenotype_target" value="hba1c" />\n+                    <param name="phenotype_target_threshold" value="6.5" />\n+                    <param name="sample_disease_group" value="T2D" />\n+                    <param name="sample_disease_group_scale" value="5" />\n+<<<<<<< HEAD\n+=======\n+                    <param name="compare_title" value="HbA1c vs Beta Cell Type Proportion" />\n+>>>>>>> 768a6e5b (v3 update:)\n+                </conditional>\n+            </conditional>\n+            <output name="out_pdf" value="default_output.pdf" compare="sim_size" />\n+            <output_collection name="summaries" count="5">\n+                <element name="Log of MuSiC fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.1704 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+                <element name="Log of NNLS fitting" ftype="txt">\n+                    <assert_contents>\n+                        <has_text text="Residual standard error: 0.0645 on 72 degrees of freedom"/>\n+                    </assert_contents>\n+                </element>\n+            </output_collection>\n+        </test>\n+    </tests>\n+    <help><![CDATA[\n+MuSiC utilizes cell-type specific gene expression from single-cell RNA sequencing (RNA-seq) data to characterize cell type compositions from bulk RNA-seq data in complex tissues. By appropriate weighting of genes showing cross-subject and cross-cell consistency, MuSiC enables the transfer of cell type-specific gene expression information from one dataset to another.\n+\n+Solid tissues often contain closely related cell types which leads to collinearity. To deal with collinearity, MuSiC employs a tree-guided procedure that recursively zooms in on closely related cell types. Briefly, we first group similar cell types into the same cluster and estimate cluster proportions, then recursively repeat this procedure within each cluster.\n+\n+    ]]></help>\n+    <citations>\n+        <citation type="doi">https://doi.org/10.1038/s41467-018-08023-x</citation>\n+    </citations>\n+</tool>\n\\ No newline at end of file\n'
b
diff -r 2ba99a52bd44 -r fb36f390cc52 scripts/dendrogram.R.orig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/dendrogram.R.orig Mon Oct 28 17:32:19 2024 +0000
[
@@ -0,0 +1,136 @@
+##
+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))
+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))
+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))
+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))
+## We use this script to generate a clustering dendrogram of cell
+## types, using the prior labelling from scRNA.
+
+read_list <- function(lfile) {
+    if (lfile == "None") {
+        return(NULL)
+    }
+<<<<<<< HEAD
+    return(read.table(file = lfile, header = FALSE, check.names = FALSE,
+=======
+    return(read.table(file = lfile, header = FALSE, check.names=FALSE,
+>>>>>>> 768a6e5b (v3 update:)
+                      stringsAsFactors = FALSE)$V1)
+}
+
+args <- commandArgs(trailingOnly = TRUE)
+source(args[1])
+
+
+## Perform the estimation
+## Produce the first step information
+sub.basis <- music_basis(scrna_eset, clusters = celltypes_label,
+                         samples = samples_label,
+                         select.ct = celltypes)
+
+## Plot the dendrogram of design matrix and cross-subject mean of
+## realtive abundance
+## Hierarchical clustering using Complete Linkage
+d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean")
+hc1 <- hclust(d1, method = "complete")
+## Hierarchical clustering using Complete Linkage
+d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean")
+hc2 <- hclust(d2, method = "complete")
+
+
+if (length(data.to.use) > 0) {
+    ## We then perform bulk tissue cell type estimation with pre-grouping
+    ## of cell types: C, list_of_cell_types, marker genes name, marker
+    ## genes list.
+    ## data.to.use = list(
+    ##     "C1" = list(cell.types = c("Neutro"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C2" = list(cell.types = c("Podo"),
+    ##                 marker.names=NULL,
+    ##                 marker.list=NULL),
+    ##     "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"),
+    ##                 marker.names = "Epithelial",
+    ##                 marker.list = read_list("../test-data/epith.markers")),
+    ##     "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"),
+    ##                 marker.names = "Immune",
+    ##                 marker.list = read_list("../test-data/immune.markers"))
+    ## )
+    grouped_celltypes <- lapply(data.to.use, function(x) {
+        x$cell.types
+    })
+    marker_groups <- lapply(data.to.use, function(x) {
+        x$marker.list
+    })
+    names(marker_groups) <- names(data.to.use)
+
+
+    cl_type <- as.character(scrna_eset[[celltypes_label]])
+
+    for (cl in seq_len(length(grouped_celltypes))) {
+        cl_type[cl_type %in%
+                grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl]
+    }
+    pData(scrna_eset)[[clustertype_label]] <- factor(
+        cl_type, levels = c(names(grouped_celltypes),
+                            "CD-Trans", "Novel1", "Novel2"))
+
+    est_bulk <- music_prop.cluster(
+        bulk.eset = bulk_eset, sc.eset = scrna_eset,
+        group.markers = marker_groups, clusters = celltypes_label,
+        groups = clustertype_label, samples = samples_label,
+        clusters.type = grouped_celltypes
+    )
+
+    estimated_music_props <- est_bulk$Est.prop.weighted.cluster
+    ## NNLS is not calculated here
+
+    ## Show different in estimation methods
+    ## Jitter plot of estimated cell type proportions
+    methods_list <- c("MuSiC")
+
+    jitter_fig <- Jitter_Est(
+        list(data.matrix(estimated_music_props)),
+        method.name = methods_list, title = "Jitter plot of Est Proportions",
+        size = 2, alpha = 0.7) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
+
+    plot_box <- Boxplot_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        theme_minimal() +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text = element_text(size = 6),
+              axis.text.x = element_blank(),
+              legend.position = "none")
+
+    plot_hmap <- Prop_heat_Est(list(
+        data.matrix(estimated_music_props)),
+        method.name = methods_list) +
+        labs(x = element_blank(), y = element_blank()) +
+        theme(axis.text.y = element_text(size = 6),
+              axis.text.x = element_text(angle = -90, size = 5),
+              plot.title = element_text(size = 9),
+              legend.key.width = unit(0.15, "cm"),
+              legend.text = element_text(size = 5),
+              legend.title = element_text(size = 5))
+
+}
+    
+pdf(file = outfile_pdf, width = 8, height = 8)
+par(mfrow = c(1, 2))
+plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)")
+plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)")
+if (length(data.to.use) > 0) {
+    plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2)
+}
+message(dev.off())
+
+if (length(data.to.use) > 0) {
+    write.table(estimated_music_props,
+                file = outfile_tab, quote = F, col.names = NA, sep = "\t")
+}
b
diff -r 2ba99a52bd44 -r fb36f390cc52 scripts/estimateprops.R.orig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/estimateprops.R.orig Mon Oct 28 17:32:19 2024 +0000
[
b'@@ -0,0 +1,281 @@\n+suppressWarnings(suppressPackageStartupMessages(library(xbioc)))\n+suppressWarnings(suppressPackageStartupMessages(library(MuSiC)))\n+suppressWarnings(suppressPackageStartupMessages(library(reshape2)))\n+suppressWarnings(suppressPackageStartupMessages(library(cowplot)))\n+## We use this script to estimate the effectiveness of proportion methods\n+\n+## Load Conf\n+args <- commandArgs(trailingOnly = TRUE)\n+source(args[1])\n+\n+## Estimate cell type proportions\n+est_prop <- music_prop(\n+    bulk.eset = bulk_eset, sc.eset = scrna_eset,\n+    clusters = celltypes_label,\n+    samples = samples_label, select.ct = celltypes, verbose = T)\n+\n+\n+estimated_music_props <- est_prop$Est.prop.weighted\n+estimated_nnls_props <- est_prop$Est.prop.allgene\n+##\n+estimated_music_props_flat <- melt(estimated_music_props)\n+estimated_nnls_props_flat <- melt(estimated_nnls_props)\n+\n+scale_yaxes <- function(gplot, value) {\n+    if (is.na(value)) {\n+        gplot\n+    } else {\n+        gplot + scale_y_continuous(lim = c(0, value))\n+    }\n+}\n+\n+sieve_data <- function(func, music_data, nnls_data) {\n+    if (func == "list") {\n+        res <- list(if ("MuSiC" %in% methods) music_data else NULL,\n+                    if ("NNLS" %in% methods) nnls_data else NULL)\n+        res[lengths(res) > 0] ## filter out NULL elements\n+    } else if (func == "rbind") {\n+        rbind(if ("MuSiC" %in% methods) music_data else NULL,\n+              if ("NNLS" %in% methods) nnls_data else NULL)\n+    } else if (func == "c") {\n+        c(if ("MuSiC" %in% methods) music_data else NULL,\n+          if ("NNLS" %in% methods) nnls_data else NULL)\n+    }\n+}\n+\n+\n+## Show different in estimation methods\n+## Jitter plot of estimated cell type proportions\n+jitter_fig <- scale_yaxes(Jitter_Est(\n+    sieve_data("list",\n+               data.matrix(estimated_music_props),\n+               data.matrix(estimated_nnls_props)),\n+    method.name = methods, title = "Jitter plot of Est Proportions",\n+    size = 2, alpha = 0.7) + theme_minimal(), maxyscale)\n+\n+## Make a Plot\n+## A more sophisticated jitter plot is provided as below. We separated\n+## the T2D subjects and normal subjects by their disease factor levels.\n+m_prop <- sieve_data("rbind",\n+                     estimated_music_props_flat,\n+                     estimated_nnls_props_flat)\n+colnames(m_prop) <- c("Sub", "CellType", "Prop")\n+\n+if (is.null(celltypes)) {\n+    celltypes <- levels(m_prop$CellType)\n+    message("No celltypes declared, using:")\n+    message(celltypes)\n+}\n+\n+if (is.null(phenotype_factors)) {\n+    phenotype_factors <- colnames(pData(bulk_eset))\n+}\n+## filter out unwanted factors like "sampleID" and "subjectName"\n+phenotype_factors <- phenotype_factors[\n+    !(phenotype_factors %in% phenotype_factors_always_exclude)]\n+message("Phenotype Factors to use:")\n+message(paste0(phenotype_factors, collapse = ", "))\n+\n+m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint\n+m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint\n+                        levels = methods)\n+\n+if (use_disease_factor) {\n+\n+    if (phenotype_target_threshold == -99) {\n+        phenotype_target_threshold <- -Inf\n+        message("phenotype target threshold set to -Inf")\n+    }\n+    ## the "2" here is to do with the sample groups, not number of methods\n+    m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint\n+    m_prop <- m_prop[!is.na(m_prop$Disease_factor), ]\n+    ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2\n+    sample_groups <- c("Normal", sample_disease_group)\n+    m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint\n+                             levels = sample_groups)\n+\n+    ## Binary to scale: e.g. TRUE / 5 = 0.2\n+    m_prop$D <- (m_prop$Disease ==   # nolint\n+                 sample_disease_group) / sample_disease_group_scale\n+    ## NA\'s are not included in the comparison below\n+  '..b'_text(angle = -90),\n+          axis.text.y = element_text(size = 6))\n+\n+pdf(file = outfile_pdf, width = 8, height = 8)\n+if (length(celltypes) <= 8) {\n+    plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2)\n+} else {\n+    print(jitter_fig)\n+    plot_box\n+}\n+if (use_disease_factor) {\n+    plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2)\n+}\n+plot_hmap\n+message(dev.off())\n+\n+writable <- function(obj, prefix, title) {\n+    write.table(obj,\n+                file = paste0("report_data/", prefix, "_",\n+                              title, ".tabular"),\n+                quote = F, sep = "\\t", col.names = NA)\n+}\n+\n+## Output Proportions\n+if ("NNLS" %in% methods) {\n+    writable(est_prop$Est.prop.allgene, "prop",\n+             "NNLS Estimated Proportions of Cell Types")\n+}\n+\n+if ("MuSiC" %in% methods) {\n+    writable(est_prop$Est.prop.weighted, "prop",\n+             "Music Estimated Proportions of Cell Types")\n+    writable(est_prop$Weight.gene, "weightgene",\n+             "Music Estimated Proportions of Cell Types (by Gene)")\n+    writable(est_prop$r.squared.full, "rsquared",\n+             "Music R-sqr Estimated Proportions of Each Subject")\n+    writable(est_prop$Var.prop, "varprop",\n+             "Matrix of Variance of MuSiC Estimates")\n+}\n+\n+\n+<<<<<<< HEAD\n+=======\n+write.table(est_prop$Est.prop.weighted,\n+            file = paste0("report_data/prop_",\n+                          "Music Estimated Proportions of Cell Types",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Est.prop.allgene,\n+            file = paste0("report_data/prop_",\n+                          "NNLS Estimated Proportions of Cell Types",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Weight.gene,\n+            file = paste0("report_data/weightgene_",\n+                          "Music Estimated Proportions of Cell Types (by Gene)",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$r.squared.full,\n+            file = paste0("report_data/rsquared_",\n+                          "Music R-sqr Estimated Proportions of Each Subject",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+write.table(est_prop$Var.prop,\n+            file = paste0("report_data/varprop_",\n+                          "Matrix of Variance of MuSiC Estimates",\n+                          ".tabular"),\n+            quote = F, sep = "\\t", col.names = NA)\n+\n+\n+>>>>>>> 7a416140 (fitting summaries only apply when disease factor is used)\n+if (use_disease_factor) {\n+    ## Summary table of linear regressions of disease factors\n+    for (meth in methods) {\n+        ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data =\n+        sub_data <- subset(m_prop_ana, Method == meth)\n+\n+        ## We can only do regression where there are more than 1 factors\n+        ## so we must find and exclude the ones which are not\n+        gt1_facts <- sapply(phenotype_factors, function(facname) {\n+            return(length(unique(sort(sub_data[[facname]]))) == 1)\n+        })\n+        form_factors <- phenotype_factors\n+        exclude_facts <- names(gt1_facts)[gt1_facts]\n+        if (length(exclude_facts) > 0) {\n+            message("Factors with only one level will be excluded:")\n+            message(exclude_facts)\n+            form_factors <- phenotype_factors[\n+                !(phenotype_factors %in% exclude_facts)]\n+        }\n+        lm_beta_meth <- lm(as.formula(\n+            paste("ct.prop", paste(form_factors, collapse = " + "),\n+                  sep = " ~ ")), data = sub_data)\n+        message(paste0("Summary: ", meth))\n+        capture.output(summary(lm_beta_meth),\n+                       file = paste0("report_data/summ_Log of ",\n+                                     meth,\n+                                     " fitting.txt"))\n+    }\n+}\n+\n'