Mercurial > repos > bgruening > music_construct_eset
diff scripts/estimateprops.R @ 1:be91cb6f48e7 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 683bb72ae92b5759a239b7e3bf4c5a229ed35b54"
author | bgruening |
---|---|
date | Fri, 26 Nov 2021 15:55:11 +0000 |
parents | 2cfd0db49bbc |
children | 7902cd31b9b5 |
line wrap: on
line diff
--- a/scripts/estimateprops.R Sun Sep 12 19:49:12 2021 +0000 +++ b/scripts/estimateprops.R Fri Nov 26 15:55:11 2021 +0000 @@ -14,36 +14,67 @@ clusters = celltypes_label, samples = samples_label, select.ct = celltypes, verbose = T) +estimated_music_props <- est_prop$Est.prop.weighted +estimated_nnls_props <- est_prop$Est.prop.allgene ## Show different in estimation methods ## Jitter plot of estimated cell type proportions -jitter.fig <- Jitter_Est( - list(data.matrix(est_prop$Est.prop.weighted), - data.matrix(est_prop$Est.prop.allgene)), - method.name = methods, title = "Jitter plot of Est Proportions") +jitter_fig <- Jitter_Est( + list(data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = methods, title = "Jitter plot of Est Proportions", + size = 2, alpha = 0.7) + theme_minimal() ## Make a Plot ## A more sophisticated jitter plot is provided as below. We separated -## the T2D subjects and normal subjects by their HbA1c levels. -m_prop <- rbind(melt(est_prop$Est.prop.weighted), - melt(est_prop$Est.prop.allgene)) +## the T2D subjects and normal subjects by their disease factor levels. +estimated_music_props_flat <- melt(estimated_music_props) +estimated_nnls_props_flat <- melt(estimated_nnls_props) + +m_prop <- rbind(estimated_music_props_flat, + estimated_nnls_props_flat) +colnames(m_prop) <- c("Sub", "CellType", "Prop") + +if (is.null(celltypes)) { + celltypes <- levels(m_prop$CellType) + message("No celltypes declared, using:") + message(celltypes) +} -colnames(m_prop) <- c("Sub", "CellType", "Prop") +if (phenotype_target_threshold == -99) { + phenotype_target_threshold <- -Inf + message("phenotype target threshold set to -Inf") +} + +if (is.null(phenotype_factors)) { + phenotype_factors <- colnames(pData(bulk_eset)) +} +## filter out unwanted factors like "sampleID" and "subjectName" +phenotype_factors <- phenotype_factors[ + !(phenotype_factors %in% phenotype_factors_always_exclude)] +message("Phenotype Factors to use:") +message(phenotype_factors) + m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint -m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint -m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint -m_prop <- m_prop[!is.na(m_prop$HbA1c), ] -m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint +m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint + levels = methods) +m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint +m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] +## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 +sample_groups <- c("Normal", sample_disease_group) +m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint levels = sample_groups) +## Binary to scale: e.g. TRUE / 5 = 0.2 m_prop$D <- (m_prop$Disease == # nolint sample_disease_group) / sample_disease_group_scale -m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype), - subset(m_prop, Disease != healthy_phenotype)) +## NA's are not included in the comparison below +m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), + subset(m_prop, Disease == sample_disease_group)) -jitter.new <- ggplot(m_prop, aes(Method, Prop)) + +jitter_new <- ggplot(m_prop, aes(Method, Prop)) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7, position = position_jitter(width = 0.25, height = 0)) + @@ -52,20 +83,23 @@ scale_shape_manual(values = c(21, 24)) + theme_minimal() ## Plot to compare method effectiveness -## Create dataframe for beta cell proportions and HbA1c levels -m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors], - ct.prop = c(est_prop$Est.prop.weighted[, 2], - est_prop$Est.prop.allgene[, 2]), - Method = factor(rep(methods, each = 89), +## Create dataframe for beta cell proportions and Disease_factor levels +m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint + phenotype_factors], + ct.prop = c(estimated_music_props[, 2], + estimated_nnls_props[, 2]), + Method = factor(rep(methods, + each = nrow(estimated_music_props)), levels = methods)) -colnames(m_prop_ana)[1:4] <- phenotype_factors -m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_gene])) +colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint +m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) m_prop_ana$Disease <- factor(sample_groups[( # nolint - m_prop_ana[phenotype_gene] > 6.5) + 1], sample_groups) + m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1], + sample_groups) m_prop_ana$D <- (m_prop_ana$Disease == # nolint sample_disease_group) / sample_disease_group_scale -jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_gene, "ct.prop")) + +jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), size = 2, alpha = 0.7) + facet_wrap(~ Method) + @@ -73,21 +107,81 @@ scale_colour_manual(values = c("white", "gray20")) + scale_shape_manual(values = c(21, 24)) +## BoxPlot +plot_box <- Boxplot_Est(list( + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = c("MuSiC", "NNLS")) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 8)) + + ggtitle(element_blank()) + theme_minimal() + +## Heatmap +plot_hmap <- Prop_heat_Est(list( + data.matrix(estimated_music_props), + data.matrix(estimated_nnls_props)), + method.name = c("MuSiC", "NNLS")) + + theme(axis.text.x = element_text(angle = -90), + axis.text.y = element_text(size = 6)) pdf(file = outfile_pdf, width = 8, height = 8) -plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2) -jitt_compare -dev.off() +plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) +plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) +plot_hmap +message(dev.off()) + +## Output Proportions + +write.table(est_prop$Est.prop.weighted, + file = paste0("report_data/prop_", + "Music Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Est.prop.allgene, + file = paste0("report_data/prop_", + "NNLS Estimated Proportions of Cell Types", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Weight.gene, + file = paste0("report_data/weightgene_", + "Music Estimated Proportions of Cell Types (by Gene)", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$r.squared.full, + file = paste0("report_data/rsquared_", + "Music R-sqr Estimated Proportions of Each Subject", + ".tabular"), + quote = F, sep = "\t", col.names = NA) +write.table(est_prop$Var.prop, + file = paste0("report_data/varprop_", + "Matrix of Variance of MuSiC Estimates", + ".tabular"), + quote = F, sep = "\t", col.names = NA) + ## Summary table for (meth in methods) { ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = - ##subset(m_prop_ana, Method == meth)) + sub_data <- subset(m_prop_ana, Method == meth) + ## We can only do regression where there are more than 1 factors + ## so we must find and exclude the ones which are not + gt1_facts <- sapply(phenotype_factors, function(facname) { + return(length(unique(sort(sub_data[[facname]]))) == 1) + }) + form_factors <- phenotype_factors + exclude_facts <- names(gt1_facts)[gt1_facts] + if (length(exclude_facts) > 0) { + message("Factors with only one level will be excluded:") + message(exclude_facts) + form_factors <- phenotype_factors[ + !(phenotype_factors %in% exclude_facts)] + } lm_beta_meth <- lm(as.formula( - paste("ct.prop", paste(phenotype_factors, collapse = " + "), - sep = " ~ ")), - data = subset(m_prop_ana, Method == meth)) - print(paste0("Summary: ", meth)) + paste("ct.prop", paste(form_factors, collapse = " + "), + sep = " ~ ")), data = sub_data) + message(paste0("Summary: ", meth)) capture.output(summary(lm_beta_meth), - file = paste0("report_data/summ_", meth, ".txt")) + file = paste0("report_data/summ_Log of ", + meth, + " fitting.txt")) }