Mercurial > repos > artbio > mutational_patterns
comparison mutational_patterns.R @ 2:aea952be68cb draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit cd8f633245d53cf47eaf860a4e0ae8d806c34419"
| author | artbio |
|---|---|
| date | Wed, 16 Sep 2020 22:58:28 +0000 |
| parents | 924c527fb379 |
| children | e332cf9dfa06 |
comparison
equal
deleted
inserted
replaced
| 1:921c1f55481d | 2:aea952be68cb |
|---|---|
| 3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) |
| 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
| 5 warnings() | 5 warnings() |
| 6 library(optparse) | 6 library(optparse) |
| 7 library(rjson) | 7 library(rjson) |
| 8 library(MutationalPatterns) | 8 library(grid) |
| 9 library(ggplot2) | 9 library(gridExtra) |
| 10 | 10 |
| 11 # Arguments | 11 # Arguments |
| 12 option_list = list( | 12 option_list = list( |
| 13 make_option( | 13 make_option( |
| 14 "--inputs", | 14 "--inputs", |
| 33 default = 2, | 33 default = 2, |
| 34 type = 'integer', | 34 type = 'integer', |
| 35 help = "selects the N most significant signatures in samples to express mutational patterns" | 35 help = "selects the N most significant signatures in samples to express mutational patterns" |
| 36 ), | 36 ), |
| 37 make_option( | 37 make_option( |
| 38 "--output", | 38 "--nrun", |
| 39 default = 2, | |
| 40 type = 'integer', | |
| 41 help = "Number of runs to fit signatures" | |
| 42 ), | |
| 43 make_option( | |
| 44 "--rank", | |
| 45 default = 2, | |
| 46 type = 'integer', | |
| 47 help = "number of ranks to display for parameter optimization" | |
| 48 ), | |
| 49 make_option( | |
| 50 "--newsignum", | |
| 51 default = 2, | |
| 52 type = 'integer', | |
| 53 help = "Number of new signatures to be captured" | |
| 54 ), | |
| 55 | |
| 56 make_option( | |
| 57 "--output_spectrum", | |
| 58 default = NA, | |
| 59 type = 'character', | |
| 60 help = "path to output dataset" | |
| 61 ), | |
| 62 make_option( | |
| 63 "--output_denovo", | |
| 64 default = NA, | |
| 65 type = 'character', | |
| 66 help = "path to output dataset" | |
| 67 ), | |
| 68 make_option( | |
| 69 "--output_cosmic", | |
| 39 default = NA, | 70 default = NA, |
| 40 type = 'character', | 71 type = 'character', |
| 41 help = "path to output dataset" | 72 help = "path to output dataset" |
| 42 ) | 73 ) |
| 43 ) | 74 ) |
| 44 | 75 |
| 45 opt = parse_args(OptionParser(option_list = option_list), | 76 opt = parse_args(OptionParser(option_list = option_list), |
| 46 args = commandArgs(trailingOnly = TRUE)) | 77 args = commandArgs(trailingOnly = TRUE)) |
| 47 | 78 |
| 79 ################ Manage input data #################### | |
| 48 json_dict <- opt$inputs | 80 json_dict <- opt$inputs |
| 49 parser <- newJSONParser() | 81 parser <- newJSONParser() |
| 50 parser$addData(json_dict) | 82 parser$addData(json_dict) |
| 51 fileslist <- parser$getObject() | 83 fileslist <- parser$getObject() |
| 52 vcf_files <- attr(fileslist, "names") | 84 vcf_files <- attr(fileslist, "names") |
| 53 sample_names <- unname(unlist(fileslist)) | 85 sample_names <- unname(unlist(fileslist)) |
| 54 pdf(opt$output, paper = "special", width = 11.69, height = 11.69) | |
| 55 ref_genome <- opt$genome | 86 ref_genome <- opt$genome |
| 87 vcf_table <- data.frame(sample_name=sample_names, path=vcf_files) | |
| 88 | |
| 89 library(MutationalPatterns) | |
| 56 library(ref_genome, character.only = TRUE) | 90 library(ref_genome, character.only = TRUE) |
| 57 | 91 |
| 58 # Load the VCF files into a GRangesList: | 92 # Load the VCF files into a GRangesList: |
| 59 vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome) | 93 vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome) |
| 60 levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level")) | 94 if (!is.na(opt$levels)[1]) { # manage levels if there are |
| 61 vcf_table <- data.frame(path=vcf_files, sample_name=sample_names) | 95 levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level")) |
| 62 metadata_table <- merge(vcf_table, levels_table, by.x=2, by.y=1) | 96 library(dplyr) |
| 63 levels <- metadata_table$level | 97 metadata_table <- left_join(vcf_table, levels_table, by = "sample_name") |
| 64 muts = mutations_from_vcf(vcfs[[1]]) | 98 # detach("package:dplyr", unload=TRUE) |
| 65 types = mut_type(vcfs[[1]]) | 99 tissue <- metadata_table$level |
| 66 context = mut_context(vcfs[[1]], ref_genome) | 100 } |
| 67 type_context = type_context(vcfs[[1]], ref_genome) | 101 |
| 68 type_occurrences <- mut_type_occurrences(vcfs, ref_genome) | 102 ##### This is done for any section ###### |
| 69 # p1 <- plot_spectrum(type_occurrences) | |
| 70 # p2 <- plot_spectrum(type_occurrences, CT = TRUE) | |
| 71 # p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = FALSE) | |
| 72 # | |
| 73 # plot(p2) | |
| 74 # p4 <- plot_spectrum(type_occurrences, by = levels, CT = TRUE, legend = TRUE) | |
| 75 # palette <- c("pink", "orange", "blue", "lightblue", "green", "red", "purple") | |
| 76 # p5 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE, colors=palette) | |
| 77 # | |
| 78 # plot(p4) | |
| 79 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) | 103 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) |
| 80 # plot_96_profile(mut_mat[,1:length(as.data.frame(mut_mat))], condensed = TRUE) | 104 |
| 81 mut_mat <- mut_mat + 0.0001 | 105 |
| 82 # library("NMF") | 106 ###### Section 1 Mutation characteristics and spectrums ############# |
| 83 # estimate <- nmf(mut_mat, rank=2:5, method="brunet", nrun=100, seed=123456) | 107 if (!is.na(opt$output_spectrum)[1]) { |
| 84 # plot(estimate) | 108 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69) |
| 85 # nmf_res <- extract_signatures(mut_mat, rank = 4, nrun = 100) | 109 type_occurrences <- mut_type_occurrences(vcfs, ref_genome) |
| 86 # colnames(nmf_res$signatures) <- c("Signature A", "Signature B", "Signature C", "Signature D") | 110 |
| 87 # rownames(nmf_res$contribution) <- c("Signature A", "Signature B", "Signature C", "Signature D") | 111 # mutation spectrum, total or by sample |
| 88 # plot_96_profile(nmf_res$signatures, condensed = TRUE) | 112 |
| 89 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") | 113 if (is.na(opt$levels)[1]) { |
| 90 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) | 114 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE) |
| 91 new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type) | 115 plot(p1) |
| 92 cancer_signatures = cancer_signatures[as.vector(new_order),] | 116 } else { |
| 93 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type | 117 p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE, legend=TRUE) # by sample |
| 94 cancer_signatures = as.matrix(cancer_signatures[,4:33]) | 118 p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total |
| 95 # plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) | 119 grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1)) |
| 96 hclust_cosmic = cluster_signatures(cancer_signatures, method = "average") | 120 } |
| 97 cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order] | 121 plot_96_profile(mut_mat, condensed = TRUE) |
| 98 # plot(hclust_cosmic) | 122 dev.off() |
| 99 cos_sim(mut_mat[,1], cancer_signatures[,1]) | 123 } |
| 100 cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures) | 124 |
| 101 plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE) | 125 ###### Section 2: De novo mutational signature extraction using NMF ####### |
| 102 fit_res <- fit_to_signatures(mut_mat, cancer_signatures) | 126 if (!is.na(opt$output_denovo)[1]) { |
| 103 threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1] | 127 mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix |
| 104 select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded | 128 # Use the NMF package to generate an estimate rank plot |
| 105 plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute") | 129 library("NMF") |
| 106 plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative") | 130 estimate <- nmf(mut_mat, rank=1:opt$rank+1, method="brunet", nrun=opt$nrun, seed=123456) |
| 107 plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") | 131 # And plot it |
| 108 | 132 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69) |
| 109 | 133 p.trans <- plot(estimate) |
| 110 sig5data <- as.data.frame(t(head(fit_res$contribution[select,]))) | 134 grid.arrange(p.trans) |
| 111 colnames(sig5data) <- gsub("nature", "", colnames(sig5data)) | 135 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures |
| 112 sig5data_percents <- sig5data / (apply(sig5data,1,sum)) * 100 | 136 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter |
| 113 sig5data_percents$sample <- rownames(sig5data_percents) | 137 # to achieve stability and avoid local minima) |
| 114 library(reshape2) | 138 nmf_res <- extract_signatures(mut_mat, rank=opt$rank, nrun=opt$nrun) |
| 115 melted_sig5data_percents <-melt(data=sig5data_percents) | 139 # Assign signature names |
| 116 melted_sig5data_percents$label <- sub("Sig.", "", melted_sig5data_percents$variable) | 140 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$rank) |
| 117 melted_sig5data_percents$pos <- cumsum(melted_sig5data_percents$value) - melted_sig5data_percents$value/2 | 141 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$rank) |
| 118 ggplot(melted_sig5data_percents, aes(x="", y=value, group=variable, fill=variable)) + | 142 # Plot the 96-profile of the signatures: |
| 119 geom_bar(width = 1, stat = "identity") + | 143 p.trans <- plot_96_profile(nmf_res$signatures, condensed = TRUE) |
| 120 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) + | 144 grid.arrange(p.trans) |
| 121 coord_polar("y", start=0) + facet_wrap(~ sample) + | 145 # Visualize the contribution of the signatures in a barplot |
| 122 labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") + | 146 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE) |
| 123 theme(axis.text = element_blank(), | 147 # Visualize the contribution of the signatures in absolute number of mutations |
| 124 axis.ticks = element_blank(), | 148 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="absolute", coord_flip = TRUE) |
| 125 panel.grid = element_blank()) | 149 # Combine the two plots: |
| 126 dev.off() | 150 grid.arrange(pc1, pc2) |
| 151 | |
| 152 # The relative contribution of each signature for each sample can also be plotted as a heatmap with | |
| 153 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots. | |
| 154 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures | |
| 155 # can be plotted in a user-specified order. | |
| 156 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order: | |
| 157 pch1 <- plot_contribution_heatmap(nmf_res$contribution, | |
| 158 sig_order = paste0("NewSig_", 1:opt$rank)) | |
| 159 # Plot signature contribution as a heatmap without sample clustering: | |
| 160 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE) | |
| 161 #Combine the plots into one figure: | |
| 162 grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6)) | |
| 163 | |
| 164 # Compare the reconstructed mutational profile with the original mutational profile: | |
| 165 plot_compare_profiles(mut_mat[,1], | |
| 166 nmf_res$reconstructed[,1], | |
| 167 profile_names = c("Original", "Reconstructed"), | |
| 168 condensed = TRUE) | |
| 169 dev.off() | |
| 170 } | |
| 171 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures #### | |
| 172 | |
| 173 if (!is.na(opt$output_cosmic)[1]) { | |
| 174 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69) | |
| 175 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") | |
| 176 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) | |
| 177 mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix | |
| 178 new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type) | |
| 179 cancer_signatures = cancer_signatures[as.vector(new_order),] | |
| 180 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type | |
| 181 cancer_signatures = as.matrix(cancer_signatures[,4:33]) | |
| 182 | |
| 183 # Plot mutational profiles of the COSMIC signatures | |
| 184 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels | |
| 185 p.trans <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) | |
| 186 grid.arrange(p.trans, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3))) | |
| 187 | |
| 188 # Hierarchically cluster the COSMIC signatures based on their similarity with average linkage | |
| 189 # hclust_cosmic = cluster_signatures(cancer_signatures, method = "average") | |
| 190 # store signatures in new order | |
| 191 # cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order] | |
| 192 # plot(hclust_cosmic) | |
| 193 | |
| 194 # Similarity between mutational profiles and COSMIC signatures | |
| 195 | |
| 196 | |
| 197 # The similarity between each mutational profile and each COSMIC signature, can be calculated | |
| 198 # with cos_sim_matrix, and visualized with plot_cosine_heatmap. The cosine similarity reflects | |
| 199 # how well each mutational profile can be explained by each signature individually. The advantage | |
| 200 # of this heatmap representation is that it shows in a glance the similarity in mutational | |
| 201 # profiles between samples, while at the same time providing information on which signatures | |
| 202 # are most prominent. The samples can be hierarchically clustered in plot_cosine_heatmap. | |
| 203 # The cosine similarity between two mutational profiles/signatures can be calculated with cos_sim : | |
| 204 # cos_sim(mut_mat[,1], cancer_signatures[,1]) | |
| 205 | |
| 206 # Calculate pairwise cosine similarity between mutational profiles and COSMIC signatures | |
| 207 | |
| 208 # cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures) | |
| 209 # Plot heatmap with specified signature order | |
| 210 # p.trans <- plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE) | |
| 211 # grid.arrange(p.trans) | |
| 212 | |
| 213 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles | |
| 214 fit_res <- fit_to_signatures(mut_mat, cancer_signatures) | |
| 215 | |
| 216 # Select signatures with some contribution (above a threshold) | |
| 217 threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1] | |
| 218 select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded | |
| 219 # Plot contribution barplots | |
| 220 pc1 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute") | |
| 221 pc2 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative") | |
| 222 # Combine the two plots: | |
| 223 grid.arrange(pc1, pc2) | |
| 224 ## pie charts of comic signatures contributions in samples | |
| 225 | |
| 226 sig_data_pie <- as.data.frame(t(head(fit_res$contribution[select,]))) | |
| 227 colnames(sig_data_pie) <- gsub("nature", "", colnames(sig_data_pie)) | |
| 228 sig_data_pie_percents <- sig_data_pie / (apply(sig_data_pie,1,sum)) * 100 | |
| 229 sig_data_pie_percents$sample <- rownames(sig_data_pie_percents) | |
| 230 library(reshape2) | |
| 231 melted_sig_data_pie_percents <-melt(data=sig_data_pie_percents) | |
| 232 melted_sig_data_pie_percents$label <- sub("Sig.", "", melted_sig_data_pie_percents$variable) | |
| 233 melted_sig_data_pie_percents$pos <- cumsum(melted_sig_data_pie_percents$value) - melted_sig_data_pie_percents$value/2 | |
| 234 library(ggplot2) | |
| 235 p.trans <- ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) + | |
| 236 geom_bar(width = 1, stat = "identity") + | |
| 237 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) + | |
| 238 coord_polar("y", start=0) + facet_wrap(~ sample) + | |
| 239 labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") + | |
| 240 theme(axis.text = element_blank(), | |
| 241 axis.ticks = element_blank(), | |
| 242 panel.grid = element_blank()) | |
| 243 grid.arrange(p.trans) | |
| 244 | |
| 245 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering | |
| 246 p.trans <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") | |
| 247 grid.arrange(p.trans) | |
| 248 | |
| 249 # Compare the reconstructed mutational profile of sample 1 with its original mutational profile | |
| 250 # plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1], | |
| 251 # profile_names = c("Original", "Reconstructed"), | |
| 252 # condensed = TRUE) | |
| 253 | |
| 254 # Calculate the cosine similarity between all original and reconstructed mutational profiles with | |
| 255 # `cos_sim_matrix` | |
| 256 | |
| 257 # calculate all pairwise cosine similarities | |
| 258 cos_sim_ori_rec <- cos_sim_matrix(mut_mat, fit_res$reconstructed) | |
| 259 # extract cosine similarities per sample between original and reconstructed | |
| 260 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec)) | |
| 261 | |
| 262 # We can use ggplot to make a barplot of the cosine similarities between the original and | |
| 263 # reconstructed mutational profile of each sample. This clearly shows how well each mutational | |
| 264 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles | |
| 265 # have a cosine similarity of 1. The lower the cosine similarity between original and | |
| 266 # reconstructed, the less well the original mutational profile can be reconstructed with | |
| 267 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff. | |
| 268 | |
| 269 # Adjust data frame for plotting with gpplot | |
| 270 colnames(cos_sim_ori_rec) = "cos_sim" | |
| 271 cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec) | |
| 272 # Make barplot | |
| 273 p.trans <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) + | |
| 274 geom_bar(stat="identity", fill = "skyblue4") + | |
| 275 coord_cartesian(ylim=c(0.8, 1)) + | |
| 276 # coord_flip(ylim=c(0.8,1)) + | |
| 277 ylab("Cosine similarity\n original VS reconstructed") + | |
| 278 xlab("") + | |
| 279 # Reverse order of the samples such that first is up | |
| 280 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) + | |
| 281 theme_bw() + | |
| 282 theme(panel.grid.minor.y=element_blank(), | |
| 283 panel.grid.major.y=element_blank()) + | |
| 284 # Add cut.off line | |
| 285 geom_hline(aes(yintercept=.95)) | |
| 286 grid.arrange(p.trans, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3))) | |
| 287 | |
| 288 dev.off() | |
| 289 } |
