comparison mutational_patterns.R @ 2:aea952be68cb draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit cd8f633245d53cf47eaf860a4e0ae8d806c34419"
author artbio
date Wed, 16 Sep 2020 22:58:28 +0000
parents 924c527fb379
children e332cf9dfa06
comparison
equal deleted inserted replaced
1:921c1f55481d 2:aea952be68cb
3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5 warnings() 5 warnings()
6 library(optparse) 6 library(optparse)
7 library(rjson) 7 library(rjson)
8 library(MutationalPatterns) 8 library(grid)
9 library(ggplot2) 9 library(gridExtra)
10 10
11 # Arguments 11 # Arguments
12 option_list = list( 12 option_list = list(
13 make_option( 13 make_option(
14 "--inputs", 14 "--inputs",
33 default = 2, 33 default = 2,
34 type = 'integer', 34 type = 'integer',
35 help = "selects the N most significant signatures in samples to express mutational patterns" 35 help = "selects the N most significant signatures in samples to express mutational patterns"
36 ), 36 ),
37 make_option( 37 make_option(
38 "--output", 38 "--nrun",
39 default = 2,
40 type = 'integer',
41 help = "Number of runs to fit signatures"
42 ),
43 make_option(
44 "--rank",
45 default = 2,
46 type = 'integer',
47 help = "number of ranks to display for parameter optimization"
48 ),
49 make_option(
50 "--newsignum",
51 default = 2,
52 type = 'integer',
53 help = "Number of new signatures to be captured"
54 ),
55
56 make_option(
57 "--output_spectrum",
58 default = NA,
59 type = 'character',
60 help = "path to output dataset"
61 ),
62 make_option(
63 "--output_denovo",
64 default = NA,
65 type = 'character',
66 help = "path to output dataset"
67 ),
68 make_option(
69 "--output_cosmic",
39 default = NA, 70 default = NA,
40 type = 'character', 71 type = 'character',
41 help = "path to output dataset" 72 help = "path to output dataset"
42 ) 73 )
43 ) 74 )
44 75
45 opt = parse_args(OptionParser(option_list = option_list), 76 opt = parse_args(OptionParser(option_list = option_list),
46 args = commandArgs(trailingOnly = TRUE)) 77 args = commandArgs(trailingOnly = TRUE))
47 78
79 ################ Manage input data ####################
48 json_dict <- opt$inputs 80 json_dict <- opt$inputs
49 parser <- newJSONParser() 81 parser <- newJSONParser()
50 parser$addData(json_dict) 82 parser$addData(json_dict)
51 fileslist <- parser$getObject() 83 fileslist <- parser$getObject()
52 vcf_files <- attr(fileslist, "names") 84 vcf_files <- attr(fileslist, "names")
53 sample_names <- unname(unlist(fileslist)) 85 sample_names <- unname(unlist(fileslist))
54 pdf(opt$output, paper = "special", width = 11.69, height = 11.69)
55 ref_genome <- opt$genome 86 ref_genome <- opt$genome
87 vcf_table <- data.frame(sample_name=sample_names, path=vcf_files)
88
89 library(MutationalPatterns)
56 library(ref_genome, character.only = TRUE) 90 library(ref_genome, character.only = TRUE)
57 91
58 # Load the VCF files into a GRangesList: 92 # Load the VCF files into a GRangesList:
59 vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome) 93 vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome)
60 levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level")) 94 if (!is.na(opt$levels)[1]) { # manage levels if there are
61 vcf_table <- data.frame(path=vcf_files, sample_name=sample_names) 95 levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level"))
62 metadata_table <- merge(vcf_table, levels_table, by.x=2, by.y=1) 96 library(dplyr)
63 levels <- metadata_table$level 97 metadata_table <- left_join(vcf_table, levels_table, by = "sample_name")
64 muts = mutations_from_vcf(vcfs[[1]]) 98 # detach("package:dplyr", unload=TRUE)
65 types = mut_type(vcfs[[1]]) 99 tissue <- metadata_table$level
66 context = mut_context(vcfs[[1]], ref_genome) 100 }
67 type_context = type_context(vcfs[[1]], ref_genome) 101
68 type_occurrences <- mut_type_occurrences(vcfs, ref_genome) 102 ##### This is done for any section ######
69 # p1 <- plot_spectrum(type_occurrences)
70 # p2 <- plot_spectrum(type_occurrences, CT = TRUE)
71 # p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = FALSE)
72 #
73 # plot(p2)
74 # p4 <- plot_spectrum(type_occurrences, by = levels, CT = TRUE, legend = TRUE)
75 # palette <- c("pink", "orange", "blue", "lightblue", "green", "red", "purple")
76 # p5 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE, colors=palette)
77 #
78 # plot(p4)
79 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) 103 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)
80 # plot_96_profile(mut_mat[,1:length(as.data.frame(mut_mat))], condensed = TRUE) 104
81 mut_mat <- mut_mat + 0.0001 105
82 # library("NMF") 106 ###### Section 1 Mutation characteristics and spectrums #############
83 # estimate <- nmf(mut_mat, rank=2:5, method="brunet", nrun=100, seed=123456) 107 if (!is.na(opt$output_spectrum)[1]) {
84 # plot(estimate) 108 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69)
85 # nmf_res <- extract_signatures(mut_mat, rank = 4, nrun = 100) 109 type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
86 # colnames(nmf_res$signatures) <- c("Signature A", "Signature B", "Signature C", "Signature D") 110
87 # rownames(nmf_res$contribution) <- c("Signature A", "Signature B", "Signature C", "Signature D") 111 # mutation spectrum, total or by sample
88 # plot_96_profile(nmf_res$signatures, condensed = TRUE) 112
89 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") 113 if (is.na(opt$levels)[1]) {
90 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) 114 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE)
91 new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type) 115 plot(p1)
92 cancer_signatures = cancer_signatures[as.vector(new_order),] 116 } else {
93 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type 117 p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE, legend=TRUE) # by sample
94 cancer_signatures = as.matrix(cancer_signatures[,4:33]) 118 p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total
95 # plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) 119 grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1))
96 hclust_cosmic = cluster_signatures(cancer_signatures, method = "average") 120 }
97 cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order] 121 plot_96_profile(mut_mat, condensed = TRUE)
98 # plot(hclust_cosmic) 122 dev.off()
99 cos_sim(mut_mat[,1], cancer_signatures[,1]) 123 }
100 cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures) 124
101 plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE) 125 ###### Section 2: De novo mutational signature extraction using NMF #######
102 fit_res <- fit_to_signatures(mut_mat, cancer_signatures) 126 if (!is.na(opt$output_denovo)[1]) {
103 threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1] 127 mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
104 select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded 128 # Use the NMF package to generate an estimate rank plot
105 plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute") 129 library("NMF")
106 plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative") 130 estimate <- nmf(mut_mat, rank=1:opt$rank+1, method="brunet", nrun=opt$nrun, seed=123456)
107 plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") 131 # And plot it
108 132 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69)
109 133 p.trans <- plot(estimate)
110 sig5data <- as.data.frame(t(head(fit_res$contribution[select,]))) 134 grid.arrange(p.trans)
111 colnames(sig5data) <- gsub("nature", "", colnames(sig5data)) 135 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures
112 sig5data_percents <- sig5data / (apply(sig5data,1,sum)) * 100 136 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter
113 sig5data_percents$sample <- rownames(sig5data_percents) 137 # to achieve stability and avoid local minima)
114 library(reshape2) 138 nmf_res <- extract_signatures(mut_mat, rank=opt$rank, nrun=opt$nrun)
115 melted_sig5data_percents <-melt(data=sig5data_percents) 139 # Assign signature names
116 melted_sig5data_percents$label <- sub("Sig.", "", melted_sig5data_percents$variable) 140 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$rank)
117 melted_sig5data_percents$pos <- cumsum(melted_sig5data_percents$value) - melted_sig5data_percents$value/2 141 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$rank)
118 ggplot(melted_sig5data_percents, aes(x="", y=value, group=variable, fill=variable)) + 142 # Plot the 96-profile of the signatures:
119 geom_bar(width = 1, stat = "identity") + 143 p.trans <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
120 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) + 144 grid.arrange(p.trans)
121 coord_polar("y", start=0) + facet_wrap(~ sample) + 145 # Visualize the contribution of the signatures in a barplot
122 labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") + 146 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE)
123 theme(axis.text = element_blank(), 147 # Visualize the contribution of the signatures in absolute number of mutations
124 axis.ticks = element_blank(), 148 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="absolute", coord_flip = TRUE)
125 panel.grid = element_blank()) 149 # Combine the two plots:
126 dev.off() 150 grid.arrange(pc1, pc2)
151
152 # The relative contribution of each signature for each sample can also be plotted as a heatmap with
153 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots.
154 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures
155 # can be plotted in a user-specified order.
156 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order:
157 pch1 <- plot_contribution_heatmap(nmf_res$contribution,
158 sig_order = paste0("NewSig_", 1:opt$rank))
159 # Plot signature contribution as a heatmap without sample clustering:
160 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE)
161 #Combine the plots into one figure:
162 grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6))
163
164 # Compare the reconstructed mutational profile with the original mutational profile:
165 plot_compare_profiles(mut_mat[,1],
166 nmf_res$reconstructed[,1],
167 profile_names = c("Original", "Reconstructed"),
168 condensed = TRUE)
169 dev.off()
170 }
171 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures ####
172
173 if (!is.na(opt$output_cosmic)[1]) {
174 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69)
175 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
176 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
177 mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
178 new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type)
179 cancer_signatures = cancer_signatures[as.vector(new_order),]
180 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
181 cancer_signatures = as.matrix(cancer_signatures[,4:33])
182
183 # Plot mutational profiles of the COSMIC signatures
184 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
185 p.trans <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
186 grid.arrange(p.trans, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3)))
187
188 # Hierarchically cluster the COSMIC signatures based on their similarity with average linkage
189 # hclust_cosmic = cluster_signatures(cancer_signatures, method = "average")
190 # store signatures in new order
191 # cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order]
192 # plot(hclust_cosmic)
193
194 # Similarity between mutational profiles and COSMIC signatures
195
196
197 # The similarity between each mutational profile and each COSMIC signature, can be calculated
198 # with cos_sim_matrix, and visualized with plot_cosine_heatmap. The cosine similarity reflects
199 # how well each mutational profile can be explained by each signature individually. The advantage
200 # of this heatmap representation is that it shows in a glance the similarity in mutational
201 # profiles between samples, while at the same time providing information on which signatures
202 # are most prominent. The samples can be hierarchically clustered in plot_cosine_heatmap.
203 # The cosine similarity between two mutational profiles/signatures can be calculated with cos_sim :
204 # cos_sim(mut_mat[,1], cancer_signatures[,1])
205
206 # Calculate pairwise cosine similarity between mutational profiles and COSMIC signatures
207
208 # cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures)
209 # Plot heatmap with specified signature order
210 # p.trans <- plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE)
211 # grid.arrange(p.trans)
212
213 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles
214 fit_res <- fit_to_signatures(mut_mat, cancer_signatures)
215
216 # Select signatures with some contribution (above a threshold)
217 threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1]
218 select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded
219 # Plot contribution barplots
220 pc1 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute")
221 pc2 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative")
222 # Combine the two plots:
223 grid.arrange(pc1, pc2)
224 ## pie charts of comic signatures contributions in samples
225
226 sig_data_pie <- as.data.frame(t(head(fit_res$contribution[select,])))
227 colnames(sig_data_pie) <- gsub("nature", "", colnames(sig_data_pie))
228 sig_data_pie_percents <- sig_data_pie / (apply(sig_data_pie,1,sum)) * 100
229 sig_data_pie_percents$sample <- rownames(sig_data_pie_percents)
230 library(reshape2)
231 melted_sig_data_pie_percents <-melt(data=sig_data_pie_percents)
232 melted_sig_data_pie_percents$label <- sub("Sig.", "", melted_sig_data_pie_percents$variable)
233 melted_sig_data_pie_percents$pos <- cumsum(melted_sig_data_pie_percents$value) - melted_sig_data_pie_percents$value/2
234 library(ggplot2)
235 p.trans <- ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) +
236 geom_bar(width = 1, stat = "identity") +
237 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) +
238 coord_polar("y", start=0) + facet_wrap(~ sample) +
239 labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") +
240 theme(axis.text = element_blank(),
241 axis.ticks = element_blank(),
242 panel.grid = element_blank())
243 grid.arrange(p.trans)
244
245 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering
246 p.trans <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
247 grid.arrange(p.trans)
248
249 # Compare the reconstructed mutational profile of sample 1 with its original mutational profile
250 # plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1],
251 # profile_names = c("Original", "Reconstructed"),
252 # condensed = TRUE)
253
254 # Calculate the cosine similarity between all original and reconstructed mutational profiles with
255 # `cos_sim_matrix`
256
257 # calculate all pairwise cosine similarities
258 cos_sim_ori_rec <- cos_sim_matrix(mut_mat, fit_res$reconstructed)
259 # extract cosine similarities per sample between original and reconstructed
260 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec))
261
262 # We can use ggplot to make a barplot of the cosine similarities between the original and
263 # reconstructed mutational profile of each sample. This clearly shows how well each mutational
264 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles
265 # have a cosine similarity of 1. The lower the cosine similarity between original and
266 # reconstructed, the less well the original mutational profile can be reconstructed with
267 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff.
268
269 # Adjust data frame for plotting with gpplot
270 colnames(cos_sim_ori_rec) = "cos_sim"
271 cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec)
272 # Make barplot
273 p.trans <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) +
274 geom_bar(stat="identity", fill = "skyblue4") +
275 coord_cartesian(ylim=c(0.8, 1)) +
276 # coord_flip(ylim=c(0.8,1)) +
277 ylab("Cosine similarity\n original VS reconstructed") +
278 xlab("") +
279 # Reverse order of the samples such that first is up
280 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) +
281 theme_bw() +
282 theme(panel.grid.minor.y=element_blank(),
283 panel.grid.major.y=element_blank()) +
284 # Add cut.off line
285 geom_hline(aes(yintercept=.95))
286 grid.arrange(p.trans, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3)))
287
288 dev.off()
289 }