# HG changeset patch # User bgruening # Date 1730136727 0 # Node ID 1b2bed57b44036247a551801eae14af7cce860ed # Parent 22232092be53c1689cfa669d8228f05fd7dcfb44 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit d5c7ca22af1d4f0eaa7a607886554bebb95e8c50 diff -r 22232092be53 -r 1b2bed57b440 music-deconvolution.xml.orig --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/music-deconvolution.xml.orig Mon Oct 28 17:32:07 2024 +0000 @@ -0,0 +1,357 @@ + + estimate cell type proportions in bulk RNA-seq data + + macros.xml + + + + + + +null_str_vec = function(gstr){ + tokens = unlist(as.vector(strsplit(gstr, split=","))) + if (length(tokens) == 0){ + return(NULL) + } + if (length(tokens) == 1){ + return(tokens[[1]]) + } + return(tokens) +} + +bulk_eset = readRDS('$bulk_eset') +scrna_eset = readRDS('$scrna_eset') +use_disease_factor = FALSE +maxyscale = NA + +#if str($do.method) == "estimateprops": + +maxyscale = as.numeric('$do.maxyscale') ## yields "NA" if blank +phenotype_factors = null_str_vec('$do.phenotype_factors') +phenotype_factors_always_exclude = null_str_vec('$do.phenotype_factors_always_exclude') +celltypes_label = null_str_vec('$do.celltypes_label') +samples_label = null_str_vec('$do.samples_label') +celltypes = null_str_vec('$do.celltypes') +methods = c("MuSiC", "NNLS") + + #if str($do.disease_factor.use) == "yes": +use_disease_factor = TRUE +<<<<<<< HEAD +phenotype_scrna_target = null_str_vec('$do.disease_factor.phenotype_scrna_target') +======= +>>>>>>> 768a6e5b (v3 update:) +phenotype_target = null_str_vec('$do.disease_factor.phenotype_target') +phenotype_target_threshold = as.numeric('$do.disease_factor.phenotype_target_threshold') +sample_disease_group = null_str_vec('$do.disease_factor.sample_disease_group') +sample_disease_group_scale = as.integer('$do.disease_factor.sample_disease_group_scale') +<<<<<<< HEAD +======= +compare_title = null_str_vec('$do.disease_factor.compare_title') +>>>>>>> 768a6e5b (v3 update:) + #end if + +outfile_pdf='$out_pdf' + +#elif str($do.method) == "dendrogram": + +celltypes_label = null_str_vec('$do.celltypes_label') +clustertype_label = null_str_vec('$do.clustertype_label') +samples_label = null_str_vec('$do.samples_label') +celltypes = null_str_vec('$do.celltypes') + +data.to.use = list( + #for $i, $repeat in enumerate( $do.cluster_groups ) + #if $i == 0: + $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), + marker.names = null_str_vec('$repeat.marker_name'), + marker.list = read_list('$repeat.marker_list')) + #else + , $repeat.cluster_id = list(cell.types = null_str_vec('$repeat.celltypes'), + marker.names = null_str_vec('$repeat.marker_name'), + marker.list = read_list('$repeat.marker_list')) + #end if + #end for +) + +outfile_pdf='$out_pdf' +outfile_tab='$out_tab' + +#else + stop("No such option") +#end if + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +<<<<<<< HEAD + + + + + + + + + + + + +======= + + + + + + + + + + + + +>>>>>>> 768a6e5b (v3 update:) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + do["method"] == "dendrogram" and len(do["cluster_groups"]) >0 + + + do["method"] == "estimateprops" + + + + do["method"] == "estimateprops" and do["disease_factor"]["use"] == "yes" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +<<<<<<< HEAD + +======= +>>>>>>> 768a6e5b (v3 update:) + + + + + +<<<<<<< HEAD +======= + +>>>>>>> 768a6e5b (v3 update:) + + + + + + + + + + + + + + + + + + + + https://doi.org/10.1038/s41467-018-08023-x + + \ No newline at end of file diff -r 22232092be53 -r 1b2bed57b440 scripts/dendrogram.R.orig --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/dendrogram.R.orig Mon Oct 28 17:32:07 2024 +0000 @@ -0,0 +1,136 @@ +## +suppressWarnings(suppressPackageStartupMessages(library(xbioc))) +suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) +suppressWarnings(suppressPackageStartupMessages(library(reshape2))) +suppressWarnings(suppressPackageStartupMessages(library(cowplot))) +## We use this script to generate a clustering dendrogram of cell +## types, using the prior labelling from scRNA. + +read_list <- function(lfile) { + if (lfile == "None") { + return(NULL) + } +<<<<<<< HEAD + return(read.table(file = lfile, header = FALSE, check.names = FALSE, +======= + return(read.table(file = lfile, header = FALSE, check.names=FALSE, +>>>>>>> 768a6e5b (v3 update:) + stringsAsFactors = FALSE)$V1) +} + +args <- commandArgs(trailingOnly = TRUE) +source(args[1]) + + +## Perform the estimation +## Produce the first step information +sub.basis <- music_basis(scrna_eset, clusters = celltypes_label, + samples = samples_label, + select.ct = celltypes) + +## Plot the dendrogram of design matrix and cross-subject mean of +## realtive abundance +## Hierarchical clustering using Complete Linkage +d1 <- dist(t(log(sub.basis$Disgn.mtx + 1e-6)), method = "euclidean") +hc1 <- hclust(d1, method = "complete") +## Hierarchical clustering using Complete Linkage +d2 <- dist(t(log(sub.basis$M.theta + 1e-8)), method = "euclidean") +hc2 <- hclust(d2, method = "complete") + + +if (length(data.to.use) > 0) { + ## We then perform bulk tissue cell type estimation with pre-grouping + ## of cell types: C, list_of_cell_types, marker genes name, marker + ## genes list. + ## data.to.use = list( + ## "C1" = list(cell.types = c("Neutro"), + ## marker.names=NULL, + ## marker.list=NULL), + ## "C2" = list(cell.types = c("Podo"), + ## marker.names=NULL, + ## marker.list=NULL), + ## "C3" = list(cell.types = c("Endo","CD-PC","LOH","CD-IC","DCT","PT"), + ## marker.names = "Epithelial", + ## marker.list = read_list("../test-data/epith.markers")), + ## "C4" = list(cell.types = c("Macro","Fib","B lymph","NK","T lymph"), + ## marker.names = "Immune", + ## marker.list = read_list("../test-data/immune.markers")) + ## ) + grouped_celltypes <- lapply(data.to.use, function(x) { + x$cell.types + }) + marker_groups <- lapply(data.to.use, function(x) { + x$marker.list + }) + names(marker_groups) <- names(data.to.use) + + + cl_type <- as.character(scrna_eset[[celltypes_label]]) + + for (cl in seq_len(length(grouped_celltypes))) { + cl_type[cl_type %in% + grouped_celltypes[[cl]]] <- names(grouped_celltypes)[cl] + } + pData(scrna_eset)[[clustertype_label]] <- factor( + cl_type, levels = c(names(grouped_celltypes), + "CD-Trans", "Novel1", "Novel2")) + + est_bulk <- music_prop.cluster( + bulk.eset = bulk_eset, sc.eset = scrna_eset, + group.markers = marker_groups, clusters = celltypes_label, + groups = clustertype_label, samples = samples_label, + clusters.type = grouped_celltypes + ) + + estimated_music_props <- est_bulk$Est.prop.weighted.cluster + ## NNLS is not calculated here + + ## Show different in estimation methods + ## Jitter plot of estimated cell type proportions + methods_list <- c("MuSiC") + + jitter_fig <- Jitter_Est( + list(data.matrix(estimated_music_props)), + method.name = methods_list, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + + theme_minimal() + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text = element_text(size = 6), + axis.text.x = element_blank(), + legend.position = "none") + + plot_box <- Boxplot_Est(list( + data.matrix(estimated_music_props)), + method.name = methods_list) + + theme_minimal() + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text = element_text(size = 6), + axis.text.x = element_blank(), + legend.position = "none") + + plot_hmap <- Prop_heat_Est(list( + data.matrix(estimated_music_props)), + method.name = methods_list) + + labs(x = element_blank(), y = element_blank()) + + theme(axis.text.y = element_text(size = 6), + axis.text.x = element_text(angle = -90, size = 5), + plot.title = element_text(size = 9), + legend.key.width = unit(0.15, "cm"), + legend.text = element_text(size = 5), + legend.title = element_text(size = 5)) + +} + +pdf(file = outfile_pdf, width = 8, height = 8) +par(mfrow = c(1, 2)) +plot(hc1, cex = 0.6, hang = -1, main = "Cluster log(Design Matrix)") +plot(hc2, cex = 0.6, hang = -1, main = "Cluster log(Mean of RA)") +if (length(data.to.use) > 0) { + plot_grid(jitter_fig, plot_box, plot_hmap, ncol = 2, nrow = 2) +} +message(dev.off()) + +if (length(data.to.use) > 0) { + write.table(estimated_music_props, + file = outfile_tab, quote = F, col.names = NA, sep = "\t") +} diff -r 22232092be53 -r 1b2bed57b440 scripts/estimateprops.R.orig --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/estimateprops.R.orig Mon Oct 28 17:32:07 2024 +0000 @@ -0,0 +1,281 @@ +suppressWarnings(suppressPackageStartupMessages(library(xbioc))) +suppressWarnings(suppressPackageStartupMessages(library(MuSiC))) +suppressWarnings(suppressPackageStartupMessages(library(reshape2))) +suppressWarnings(suppressPackageStartupMessages(library(cowplot))) +## We use this script to estimate the effectiveness of proportion methods + +## Load Conf +args <- commandArgs(trailingOnly = TRUE) +source(args[1]) + +## Estimate cell type proportions +est_prop <- music_prop( + bulk.eset = bulk_eset, sc.eset = scrna_eset, + clusters = celltypes_label, + samples = samples_label, select.ct = celltypes, verbose = T) + + +estimated_music_props <- est_prop$Est.prop.weighted +estimated_nnls_props <- est_prop$Est.prop.allgene +## +estimated_music_props_flat <- melt(estimated_music_props) +estimated_nnls_props_flat <- melt(estimated_nnls_props) + +scale_yaxes <- function(gplot, value) { + if (is.na(value)) { + gplot + } else { + gplot + scale_y_continuous(lim = c(0, value)) + } +} + +sieve_data <- function(func, music_data, nnls_data) { + if (func == "list") { + res <- list(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + res[lengths(res) > 0] ## filter out NULL elements + } else if (func == "rbind") { + rbind(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } else if (func == "c") { + c(if ("MuSiC" %in% methods) music_data else NULL, + if ("NNLS" %in% methods) nnls_data else NULL) + } +} + + +## Show different in estimation methods +## Jitter plot of estimated cell type proportions +jitter_fig <- scale_yaxes(Jitter_Est( + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + theme_minimal(), maxyscale) + +## Make a Plot +## A more sophisticated jitter plot is provided as below. We separated +## the T2D subjects and normal subjects by their disease factor levels. +m_prop <- sieve_data("rbind", + estimated_music_props_flat, + estimated_nnls_props_flat) +colnames(m_prop) <- c("Sub", "CellType", "Prop") + +if (is.null(celltypes)) { + celltypes <- levels(m_prop$CellType) + message("No celltypes declared, using:") + message(celltypes) +} + +if (is.null(phenotype_factors)) { + phenotype_factors <- colnames(pData(bulk_eset)) +} +## filter out unwanted factors like "sampleID" and "subjectName" +phenotype_factors <- phenotype_factors[ + !(phenotype_factors %in% phenotype_factors_always_exclude)] +message("Phenotype Factors to use:") +message(paste0(phenotype_factors, collapse = ", ")) + +m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint +m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint + levels = methods) + +if (use_disease_factor) { + + if (phenotype_target_threshold == -99) { + phenotype_target_threshold <- -Inf + message("phenotype target threshold set to -Inf") + } + ## the "2" here is to do with the sample groups, not number of methods + m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint + m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] + ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 + sample_groups <- c("Normal", sample_disease_group) + m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint + levels = sample_groups) + + ## Binary to scale: e.g. TRUE / 5 = 0.2 + m_prop$D <- (m_prop$Disease == # nolint + sample_disease_group) / sample_disease_group_scale + ## NA's are not included in the comparison below + m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), + subset(m_prop, Disease == sample_disease_group)) + + jitter_new <- scale_yaxes( + ggplot(m_prop, aes(Method, Prop)) + + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), + size = 2, alpha = 0.7, + position = position_jitter(width = 0.25, height = 0)) + + facet_wrap(~ CellType, scales = "free") + + scale_colour_manual(values = c("white", "gray20")) + + scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale) + +} + +if (use_disease_factor) { + + ## Plot to compare method effectiveness + ## Create dataframe for beta cell proportions and Disease_factor levels + ## - Ugly code. Essentially, doubles the cell type proportions for each + ## set of MuSiC and NNLS methods + m_prop_ana <- data.frame( + pData(bulk_eset)[rep(1:nrow(estimated_music_props), length(methods)), #nolint + phenotype_factors], + ## get proportions of target cell type + ct.prop = sieve_data("c", + estimated_music_props[, phenotype_scrna_target], + estimated_nnls_props[, phenotype_scrna_target]), + ## + Method = factor(rep(methods, + each = nrow(estimated_music_props)), + levels = methods)) + ## - fix headers + colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint + ## - drop NA for target phenotype (e.g. hba1c) + m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) + m_prop_ana$Disease <- factor( # nolint + ## - Here we set Normal/Disease assignments across the methods + sample_groups[( + m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 + ], + sample_groups) + ## - Then we scale this binary assignment to a plotable factor + m_prop_ana$D <- (m_prop_ana$Disease == # nolint + sample_disease_group) / sample_disease_group_scale + + jitt_compare <- scale_yaxes( + ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + + geom_point(aes(fill = Method, color = Disease, + stroke = D, shape = Disease), + size = 2, alpha = 0.7) + facet_wrap(~ Method) + + ggtitle(paste0(toupper(phenotype_target), " vs. ", + toupper(phenotype_scrna_target), + " Cell Type Proportion")) + + theme_minimal() + + ylab(paste0("Proportion of ", + phenotype_scrna_target, " cells")) + + xlab(paste0("Level of bulk factor (", phenotype_target, ")")) + + scale_colour_manual(values = c("white", "gray20")) + + scale_shape_manual(values = c(21, 24)), maxyscale) +} + +## BoxPlot +plot_box <- scale_yaxes(Boxplot_Est( + sieve_data("list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 8)) + + ggtitle(element_blank()) + theme_minimal(), maxyscale) + +## Heatmap +plot_hmap <- Prop_heat_Est( + sieve_data( + "list", + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) + +pdf(file = outfile_pdf, width = 8, height = 8) +if (length(celltypes) <= 8) { + plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) +} else { + print(jitter_fig) + plot_box +} +if (use_disease_factor) { + plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) +} +plot_hmap +message(dev.off()) + +writable <- function(obj, prefix, title) { + write.table(obj, + file = paste0("report_data/", prefix, "_", + title, ".tabular"), + quote = F, sep = "\t", col.names = NA) +} + +## Output Proportions +if ("NNLS" %in% methods) { + writable(est_prop$Est.prop.allgene, "prop", + "NNLS Estimated Proportions of Cell Types") +} + +if ("MuSiC" %in% methods) { + writable(est_prop$Est.prop.weighted, "prop", + "Music Estimated Proportions of Cell Types") + writable(est_prop$Weight.gene, "weightgene", + "Music Estimated Proportions of Cell Types (by Gene)") + writable(est_prop$r.squared.full, "rsquared", + "Music R-sqr Estimated Proportions of Each Subject") + writable(est_prop$Var.prop, "varprop", + "Matrix of Variance of MuSiC Estimates") +} + + +<<<<<<< HEAD +======= +write.table(est_prop$Est.prop.weighted, + file = paste0("report_data/prop_", + "Music Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Est.prop.allgene, + file = paste0("report_data/prop_", + "NNLS Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Weight.gene, + file = paste0("report_data/weightgene_", + "Music Estimated Proportions of Cell Types (by Gene)", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$r.squared.full, + file = paste0("report_data/rsquared_", + "Music R-sqr Estimated Proportions of Each Subject", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Var.prop, + file = paste0("report_data/varprop_", + "Matrix of Variance of MuSiC Estimates", + ".tabular"), + quote = F, sep = "\t", col.names = NA) + + +>>>>>>> 7a416140 (fitting summaries only apply when disease factor is used) +if (use_disease_factor) { + ## Summary table of linear regressions of disease factors + for (meth in methods) { + ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = + sub_data <- subset(m_prop_ana, Method == meth) + + ## We can only do regression where there are more than 1 factors + ## so we must find and exclude the ones which are not + gt1_facts <- sapply(phenotype_factors, function(facname) { + return(length(unique(sort(sub_data[[facname]]))) == 1) + }) + form_factors <- phenotype_factors + exclude_facts <- names(gt1_facts)[gt1_facts] + if (length(exclude_facts) > 0) { + message("Factors with only one level will be excluded:") + message(exclude_facts) + form_factors <- phenotype_factors[ + !(phenotype_factors %in% exclude_facts)] + } + lm_beta_meth <- lm(as.formula( + paste("ct.prop", paste(form_factors, collapse = " + "), + sep = " ~ ")), data = sub_data) + message(paste0("Summary: ", meth)) + capture.output(summary(lm_beta_meth), + file = paste0("report_data/summ_Log of ", + meth, + " fitting.txt")) + } +} +