Mercurial > repos > bgruening > music_construct_eset
comparison scripts/estimateprops.R @ 1:be91cb6f48e7 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 683bb72ae92b5759a239b7e3bf4c5a229ed35b54"
author | bgruening |
---|---|
date | Fri, 26 Nov 2021 15:55:11 +0000 |
parents | 2cfd0db49bbc |
children | 7902cd31b9b5 |
comparison
equal
deleted
inserted
replaced
0:2cfd0db49bbc | 1:be91cb6f48e7 |
---|---|
12 est_prop <- music_prop( | 12 est_prop <- music_prop( |
13 bulk.eset = bulk_eset, sc.eset = scrna_eset, | 13 bulk.eset = bulk_eset, sc.eset = scrna_eset, |
14 clusters = celltypes_label, | 14 clusters = celltypes_label, |
15 samples = samples_label, select.ct = celltypes, verbose = T) | 15 samples = samples_label, select.ct = celltypes, verbose = T) |
16 | 16 |
17 estimated_music_props <- est_prop$Est.prop.weighted | |
18 estimated_nnls_props <- est_prop$Est.prop.allgene | |
17 | 19 |
18 ## Show different in estimation methods | 20 ## Show different in estimation methods |
19 ## Jitter plot of estimated cell type proportions | 21 ## Jitter plot of estimated cell type proportions |
20 jitter.fig <- Jitter_Est( | 22 jitter_fig <- Jitter_Est( |
21 list(data.matrix(est_prop$Est.prop.weighted), | 23 list(data.matrix(estimated_music_props), |
22 data.matrix(est_prop$Est.prop.allgene)), | 24 data.matrix(estimated_nnls_props)), |
23 method.name = methods, title = "Jitter plot of Est Proportions") | 25 method.name = methods, title = "Jitter plot of Est Proportions", |
26 size = 2, alpha = 0.7) + theme_minimal() | |
24 | 27 |
25 | 28 |
26 ## Make a Plot | 29 ## Make a Plot |
27 ## A more sophisticated jitter plot is provided as below. We separated | 30 ## A more sophisticated jitter plot is provided as below. We separated |
28 ## the T2D subjects and normal subjects by their HbA1c levels. | 31 ## the T2D subjects and normal subjects by their disease factor levels. |
29 m_prop <- rbind(melt(est_prop$Est.prop.weighted), | 32 estimated_music_props_flat <- melt(estimated_music_props) |
30 melt(est_prop$Est.prop.allgene)) | 33 estimated_nnls_props_flat <- melt(estimated_nnls_props) |
31 | 34 |
35 m_prop <- rbind(estimated_music_props_flat, | |
36 estimated_nnls_props_flat) | |
32 colnames(m_prop) <- c("Sub", "CellType", "Prop") | 37 colnames(m_prop) <- c("Sub", "CellType", "Prop") |
33 | 38 |
39 if (is.null(celltypes)) { | |
40 celltypes <- levels(m_prop$CellType) | |
41 message("No celltypes declared, using:") | |
42 message(celltypes) | |
43 } | |
44 | |
45 if (phenotype_target_threshold == -99) { | |
46 phenotype_target_threshold <- -Inf | |
47 message("phenotype target threshold set to -Inf") | |
48 } | |
49 | |
50 if (is.null(phenotype_factors)) { | |
51 phenotype_factors <- colnames(pData(bulk_eset)) | |
52 } | |
53 ## filter out unwanted factors like "sampleID" and "subjectName" | |
54 phenotype_factors <- phenotype_factors[ | |
55 !(phenotype_factors %in% phenotype_factors_always_exclude)] | |
56 message("Phenotype Factors to use:") | |
57 message(phenotype_factors) | |
58 | |
59 | |
34 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint | 60 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint |
35 m_prop$Method <- factor(rep(methods, each = 89 * 6), levels = methods) # nolint | 61 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint |
36 m_prop$HbA1c <- rep(bulk_eset$hba1c, 2 * 6) # nolint | 62 levels = methods) |
37 m_prop <- m_prop[!is.na(m_prop$HbA1c), ] | 63 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint |
38 m_prop$Disease <- factor(sample_groups[(m_prop$HbA1c > 6.5) + 1], # nolint | 64 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] |
65 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 | |
66 sample_groups <- c("Normal", sample_disease_group) | |
67 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint | |
39 levels = sample_groups) | 68 levels = sample_groups) |
40 | 69 |
70 ## Binary to scale: e.g. TRUE / 5 = 0.2 | |
41 m_prop$D <- (m_prop$Disease == # nolint | 71 m_prop$D <- (m_prop$Disease == # nolint |
42 sample_disease_group) / sample_disease_group_scale | 72 sample_disease_group) / sample_disease_group_scale |
43 m_prop <- rbind(subset(m_prop, Disease == healthy_phenotype), | 73 ## NA's are not included in the comparison below |
44 subset(m_prop, Disease != healthy_phenotype)) | 74 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), |
75 subset(m_prop, Disease == sample_disease_group)) | |
45 | 76 |
46 jitter.new <- ggplot(m_prop, aes(Method, Prop)) + | 77 jitter_new <- ggplot(m_prop, aes(Method, Prop)) + |
47 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), | 78 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), |
48 size = 2, alpha = 0.7, | 79 size = 2, alpha = 0.7, |
49 position = position_jitter(width = 0.25, height = 0)) + | 80 position = position_jitter(width = 0.25, height = 0)) + |
50 facet_wrap(~ CellType, scales = "free") + | 81 facet_wrap(~ CellType, scales = "free") + |
51 scale_colour_manual(values = c("white", "gray20")) + | 82 scale_colour_manual(values = c("white", "gray20")) + |
52 scale_shape_manual(values = c(21, 24)) + theme_minimal() | 83 scale_shape_manual(values = c(21, 24)) + theme_minimal() |
53 | 84 |
54 ## Plot to compare method effectiveness | 85 ## Plot to compare method effectiveness |
55 ## Create dataframe for beta cell proportions and HbA1c levels | 86 ## Create dataframe for beta cell proportions and Disease_factor levels |
56 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:89, 2), phenotype_factors], | 87 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint |
57 ct.prop = c(est_prop$Est.prop.weighted[, 2], | 88 phenotype_factors], |
58 est_prop$Est.prop.allgene[, 2]), | 89 ct.prop = c(estimated_music_props[, 2], |
59 Method = factor(rep(methods, each = 89), | 90 estimated_nnls_props[, 2]), |
91 Method = factor(rep(methods, | |
92 each = nrow(estimated_music_props)), | |
60 levels = methods)) | 93 levels = methods)) |
61 colnames(m_prop_ana)[1:4] <- phenotype_factors | 94 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint |
62 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_gene])) | 95 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) |
63 m_prop_ana$Disease <- factor(sample_groups[( # nolint | 96 m_prop_ana$Disease <- factor(sample_groups[( # nolint |
64 m_prop_ana[phenotype_gene] > 6.5) + 1], sample_groups) | 97 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1], |
98 sample_groups) | |
65 m_prop_ana$D <- (m_prop_ana$Disease == # nolint | 99 m_prop_ana$D <- (m_prop_ana$Disease == # nolint |
66 sample_disease_group) / sample_disease_group_scale | 100 sample_disease_group) / sample_disease_group_scale |
67 | 101 |
68 jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_gene, "ct.prop")) + | 102 jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + |
69 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + | 103 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + |
70 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), | 104 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), |
71 size = 2, alpha = 0.7) + facet_wrap(~ Method) + | 105 size = 2, alpha = 0.7) + facet_wrap(~ Method) + |
72 ggtitle(compare_title) + theme_minimal() + | 106 ggtitle(compare_title) + theme_minimal() + |
73 scale_colour_manual(values = c("white", "gray20")) + | 107 scale_colour_manual(values = c("white", "gray20")) + |
74 scale_shape_manual(values = c(21, 24)) | 108 scale_shape_manual(values = c(21, 24)) |
75 | 109 |
110 ## BoxPlot | |
111 plot_box <- Boxplot_Est(list( | |
112 data.matrix(estimated_music_props), | |
113 data.matrix(estimated_nnls_props)), | |
114 method.name = c("MuSiC", "NNLS")) + | |
115 theme(axis.text.x = element_text(angle = -90), | |
116 axis.text.y = element_text(size = 8)) + | |
117 ggtitle(element_blank()) + theme_minimal() | |
118 | |
119 ## Heatmap | |
120 plot_hmap <- Prop_heat_Est(list( | |
121 data.matrix(estimated_music_props), | |
122 data.matrix(estimated_nnls_props)), | |
123 method.name = c("MuSiC", "NNLS")) + | |
124 theme(axis.text.x = element_text(angle = -90), | |
125 axis.text.y = element_text(size = 6)) | |
76 | 126 |
77 pdf(file = outfile_pdf, width = 8, height = 8) | 127 pdf(file = outfile_pdf, width = 8, height = 8) |
78 plot_grid(jitter.fig, jitter.new, labels = "auto", ncol = 1, nrow = 2) | 128 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) |
79 jitt_compare | 129 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) |
80 dev.off() | 130 plot_hmap |
131 message(dev.off()) | |
132 | |
133 ## Output Proportions | |
134 | |
135 write.table(est_prop$Est.prop.weighted, | |
136 file = paste0("report_data/prop_", | |
137 "Music Estimated Proportions of Cell Types", | |
138 ".tabular"), | |
139 quote = F, sep = "\t", col.names = NA) | |
140 write.table(est_prop$Est.prop.allgene, | |
141 file = paste0("report_data/prop_", | |
142 "NNLS Estimated Proportions of Cell Types", | |
143 ".tabular"), | |
144 quote = F, sep = "\t", col.names = NA) | |
145 write.table(est_prop$Weight.gene, | |
146 file = paste0("report_data/weightgene_", | |
147 "Music Estimated Proportions of Cell Types (by Gene)", | |
148 ".tabular"), | |
149 quote = F, sep = "\t", col.names = NA) | |
150 write.table(est_prop$r.squared.full, | |
151 file = paste0("report_data/rsquared_", | |
152 "Music R-sqr Estimated Proportions of Each Subject", | |
153 ".tabular"), | |
154 quote = F, sep = "\t", col.names = NA) | |
155 write.table(est_prop$Var.prop, | |
156 file = paste0("report_data/varprop_", | |
157 "Matrix of Variance of MuSiC Estimates", | |
158 ".tabular"), | |
159 quote = F, sep = "\t", col.names = NA) | |
160 | |
81 | 161 |
82 ## Summary table | 162 ## Summary table |
83 for (meth in methods) { | 163 for (meth in methods) { |
84 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = | 164 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = |
85 ##subset(m_prop_ana, Method == meth)) | 165 sub_data <- subset(m_prop_ana, Method == meth) |
166 ## We can only do regression where there are more than 1 factors | |
167 ## so we must find and exclude the ones which are not | |
168 gt1_facts <- sapply(phenotype_factors, function(facname) { | |
169 return(length(unique(sort(sub_data[[facname]]))) == 1) | |
170 }) | |
171 form_factors <- phenotype_factors | |
172 exclude_facts <- names(gt1_facts)[gt1_facts] | |
173 if (length(exclude_facts) > 0) { | |
174 message("Factors with only one level will be excluded:") | |
175 message(exclude_facts) | |
176 form_factors <- phenotype_factors[ | |
177 !(phenotype_factors %in% exclude_facts)] | |
178 } | |
86 lm_beta_meth <- lm(as.formula( | 179 lm_beta_meth <- lm(as.formula( |
87 paste("ct.prop", paste(phenotype_factors, collapse = " + "), | 180 paste("ct.prop", paste(form_factors, collapse = " + "), |
88 sep = " ~ ")), | 181 sep = " ~ ")), data = sub_data) |
89 data = subset(m_prop_ana, Method == meth)) | 182 message(paste0("Summary: ", meth)) |
90 print(paste0("Summary: ", meth)) | |
91 capture.output(summary(lm_beta_meth), | 183 capture.output(summary(lm_beta_meth), |
92 file = paste0("report_data/summ_", meth, ".txt")) | 184 file = paste0("report_data/summ_Log of ", |
185 meth, | |
186 " fitting.txt")) | |
93 } | 187 } |