comparison mutational_patterns.R @ 14:56c8869a231e draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit 518fb067e8206ecafbf673a5e4cf375ccead11e3"
author artbio
date Fri, 04 Jun 2021 22:35:48 +0000
parents 6741b819cc15
children 8182d1625433
comparison
equal deleted inserted replaced
13:6741b819cc15 14:56c8869a231e
1 # load packages that are provided in the conda env 1 # load packages that are provided in the conda env
2 options( show.error.messages=F, 2 options(show.error.messages = F,
3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 3 error = function() {
4 cat(geterrmessage(), file = stderr()); q("no", 1, F)
5 }
6 )
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 7 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5 warnings() 8 warnings()
6 library(optparse) 9 library(optparse)
7 library(rjson) 10 library(rjson)
8 library(grid) 11 library(grid)
9 library(gridExtra) 12 library(gridExtra)
10 library(scales) 13 library(scales)
11 library(RColorBrewer) 14 library(RColorBrewer)
12 15
13 # Arguments 16 # Arguments
14 option_list = list( 17 option_list <- list(
15 make_option( 18 make_option(
16 "--inputs", 19 "--inputs",
17 default = NA, 20 default = NA,
18 type = 'character', 21 type = "character",
19 help = "json formatted dictionary of datasets and their paths" 22 help = "json formatted dictionary of datasets and their paths"
20 ), 23 ),
21 make_option( 24 make_option(
22 "--genome", 25 "--genome",
23 default = NA, 26 default = NA,
24 type = 'character', 27 type = "character",
25 help = "genome name in the BSgenome bioconductor package" 28 help = "genome name in the BSgenome bioconductor package"
26 ), 29 ),
27 make_option( 30 make_option(
28 "--levels", 31 "--levels",
29 default = NA, 32 default = NA,
30 type = 'character', 33 type = "character",
31 help = "path to the tab separated file describing the levels in function of datasets" 34 help = "path to the tab separated file describing the levels in function of datasets"
32 ), 35 ),
33 make_option( 36 make_option(
34 "--cosmic_version", 37 "--cosmic_version",
35 default = "v2", 38 default = "v2",
36 type = 'character', 39 type = "character",
37 help = "Version of the Cosmic Signature set to be used to express mutational patterns" 40 help = "Version of the Cosmic Signature set to be used to express mutational patterns"
38 ), 41 ),
39 make_option( 42 make_option(
40 "--signum", 43 "--signum",
41 default = 2, 44 default = 2,
42 type = 'integer', 45 type = "integer",
43 help = "selects the N most significant signatures in samples to express mutational patterns" 46 help = "selects the N most significant signatures in samples to express mutational patterns"
44 ), 47 ),
45 make_option( 48 make_option(
46 "--nrun", 49 "--nrun",
47 default = 2, 50 default = 2,
48 type = 'integer', 51 type = "integer",
49 help = "Number of runs to fit signatures" 52 help = "Number of runs to fit signatures"
50 ), 53 ),
51 make_option( 54 make_option(
52 "--rank", 55 "--rank",
53 default = 2, 56 default = 2,
54 type = 'integer', 57 type = "integer",
55 help = "number of ranks to display for parameter optimization" 58 help = "number of ranks to display for parameter optimization"
56 ), 59 ),
57 make_option( 60 make_option(
58 "--newsignum", 61 "--newsignum",
59 default = 2, 62 default = 2,
60 type = 'integer', 63 type = "integer",
61 help = "Number of new signatures to be captured" 64 help = "Number of new signatures to be captured"
62 ), 65 ),
63 make_option( 66 make_option(
64 "--output_spectrum", 67 "--output_spectrum",
65 default = NA, 68 default = NA,
66 type = 'character', 69 type = "character",
67 help = "path to output dataset" 70 help = "path to output dataset"
68 ), 71 ),
69 make_option( 72 make_option(
70 "--output_denovo", 73 "--output_denovo",
71 default = NA, 74 default = NA,
72 type = 'character', 75 type = "character",
73 help = "path to output dataset" 76 help = "path to output dataset"
74 ), 77 ),
75 make_option( 78 make_option(
76 "--sigmatrix", 79 "--sigmatrix",
77 default = NA, 80 default = NA,
78 type = 'character', 81 type = "character",
79 help = "path to signature matrix" 82 help = "path to signature matrix"
80 ), 83 ),
81 make_option( 84 make_option(
82 "--output_cosmic", 85 "--output_cosmic",
83 default = NA, 86 default = NA,
84 type = 'character', 87 type = "character",
85 help = "path to output dataset" 88 help = "path to output dataset"
86 ), 89 ),
87 make_option( 90 make_option(
88 "--sig_contrib_matrix", 91 "--sig_contrib_matrix",
89 default = NA, 92 default = NA,
90 type = 'character', 93 type = "character",
91 help = "path to signature contribution matrix" 94 help = "path to signature contribution matrix"
92 ), 95 ),
93 make_option( 96 make_option(
94 c("-r", "--rdata"), 97 c("-r", "--rdata"),
95 type="character", 98 type = "character",
96 default=NULL, 99 default = NULL,
97 help="Path to RData output file") 100 help = "Path to RData output file")
98 ) 101 )
99 102
100 opt = parse_args(OptionParser(option_list = option_list), 103 opt <- parse_args(OptionParser(option_list = option_list),
101 args = commandArgs(trailingOnly = TRUE)) 104 args = commandArgs(trailingOnly = TRUE))
102 105
103 ################ Manage input data #################### 106 ################ Manage input data ####################
104 json_dict <- opt$inputs 107 json_dict <- opt$inputs
105 parser <- newJSONParser() 108 parser <- newJSONParser()
106 parser$addData(json_dict) 109 parser$addData(json_dict)
107 fileslist <- parser$getObject() 110 fileslist <- parser$getObject()
108 vcf_paths <- attr(fileslist, "names") 111 vcf_paths <- attr(fileslist, "names")
109 element_identifiers <- unname(unlist(fileslist)) 112 element_identifiers <- unname(unlist(fileslist))
110 ref_genome <- opt$genome 113 ref_genome <- opt$genome
111 vcf_table <- data.frame(element_identifier=as.character(element_identifiers), path=vcf_paths) 114 vcf_table <- data.frame(element_identifier = as.character(element_identifiers), path = vcf_paths)
112 115
113 library(MutationalPatterns) 116 library(MutationalPatterns)
114 library(ref_genome, character.only = TRUE) 117 library(ref_genome, character.only = TRUE)
115 library(ggplot2) 118 library(ggplot2)
116 119
117 # Load the VCF files into a GRangesList: 120 # Load the VCF files into a GRangesList:
118 vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome) 121 vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome)
119 library(plyr) 122 library(plyr)
120 if (!is.na(opt$levels)[1]) { # manage levels if there are 123 if (!is.na(opt$levels)[1]) { # manage levels if there are
121 levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("element_identifier","level")) 124 levels_table <- read.delim(opt$levels, header = FALSE,
125 col.names = c("element_identifier", "level"))
122 } else { 126 } else {
123 levels_table <- data.frame(element_identifier=vcf_table$element_identifier, 127 levels_table <- data.frame(element_identifier = vcf_table$element_identifier,
124 level=rep("nolabels", length(vcf_table$element_identifier))) 128 level = rep("nolabels", length(vcf_table$element_identifier)))
125 } 129 }
126 metadata_table <- join(vcf_table, levels_table, by = "element_identifier") 130 metadata_table <- join(vcf_table, levels_table, by = "element_identifier")
127 tissue <- as.vector(metadata_table$level) 131 tissue <- as.vector(metadata_table$level)
128 detach(package:plyr) 132 detach(package:plyr)
129 133
130 ##### This is done for any section ###### 134 ##### This is done for any section ######
131 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) 135 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)
132 qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] 136 qual_col_pals <- brewer.pal.info[brewer.pal.info$category == "qual", ]
133 col_vector = unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))) 137 col_vector <- unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))))
134 col_vector <- col_vector[c(-32, -34, -39)] # 67-color palette 138 col_vector <- col_vector[c(-32, -34, -39)] # 67-color palette
135 139
136 ###### Section 1 Mutation characteristics and spectrums ############# 140 ###### Section 1 Mutation characteristics and spectrums #############
137 if (!is.na(opt$output_spectrum)[1]) { 141 if (!is.na(opt$output_spectrum)[1]) {
138 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69) 142 pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69)
139 type_occurrences <- mut_type_occurrences(vcfs, ref_genome) 143 type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
140 144
141 # mutation spectrum, total or by sample 145 # mutation spectrum, total or by sample
142 146
143 if (length(levels(factor(levels_table$level))) == 1) { # (is.na(opt$levels)[1]) 147 if (length(levels(factor(levels_table$level))) == 1) {
144 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE) 148 p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE)
145 plot(p1) 149 plot(p1)
146 } else { 150 } else {
147 p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE) # by levels 151 p2 <- plot_spectrum(type_occurrences, by = tissue, CT = TRUE) # by levels
148 p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total 152 p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = TRUE) # total
149 grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1)) 153 grid.arrange(p2, p3, ncol = 2, widths = c(4, 2.3), heights = c(4, 1))
150 } 154 }
151 plot_96_profile(mut_mat, condensed = TRUE) 155 plot_96_profile(mut_mat, condensed = TRUE)
152 dev.off() 156 dev.off()
153 } 157 }
154 158
155 ###### Section 2: De novo mutational signature extraction using NMF ####### 159 ###### Section 2: De novo mutational signature extraction using NMF #######
160 # opt$rank cannot be higher than the number of samples and
161 # likewise, opt$signum cannot be higher thant the number of samples
156 if (!is.na(opt$output_denovo)[1]) { 162 if (!is.na(opt$output_denovo)[1]) {
157 # opt$rank cannot be higher than the number of samples 163
158 if (opt$rank > length(element_identifiers)) {opt$rank <-length(element_identifiers)} 164 if (opt$rank > length(element_identifiers)) {
159 # likewise, opt$signum cannot be higher thant the number of samples 165 opt$rank <- length(element_identifiers)
160 if (opt$signum > length(element_identifiers)) {opt$signum <-length(element_identifiers)} 166 }
167 if (opt$signum > length(element_identifiers)) {
168 opt$signum <- length(element_identifiers)
169 }
161 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix 170 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix
162 # Use the NMF package to generate an estimate rank plot 171 # Use the NMF package to generate an estimate rank plot
163 library("NMF") 172 library("NMF")
164 estimate <- nmf(pseudo_mut_mat, rank=1:opt$rank, method="brunet", nrun=opt$nrun, seed=123456) 173 estimate <- nmf(pseudo_mut_mat, rank = 1:opt$rank, method = "brunet", nrun = opt$nrun, seed = 123456)
165 # And plot it 174 # And plot it
166 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69) 175 pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69)
167 p4 <- plot(estimate) 176 p4 <- plot(estimate)
168 grid.arrange(p4) 177 grid.arrange(p4)
169 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures 178 # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures
170 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter 179 # (For larger datasets it is wise to perform more iterations by changing the nrun parameter
171 # to achieve stability and avoid local minima) 180 # to achieve stability and avoid local minima)
172 nmf_res <- extract_signatures(pseudo_mut_mat, rank=opt$newsignum, nrun=opt$nrun) 181 nmf_res <- extract_signatures(pseudo_mut_mat, rank = opt$newsignum, nrun = opt$nrun)
173 # Assign signature names 182 # Assign signature names
174 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum) 183 colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum)
175 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum) 184 rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum)
176 # Plot the 96-profile of the signatures: 185 # Plot the 96-profile of the signatures:
177 p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE) 186 p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
178 new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value") 187 new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value")
179 new_sig_matrix = format(new_sig_matrix, scientific=TRUE) 188 new_sig_matrix <- format(new_sig_matrix, scientific = TRUE)
180 write.table(new_sig_matrix, file=opt$sigmatrix, quote = FALSE, row.names = FALSE, sep="\t") 189 write.table(new_sig_matrix, file = opt$sigmatrix, quote = FALSE, row.names = FALSE, sep = "\t")
181 grid.arrange(p5) 190 grid.arrange(p5)
182 # Visualize the contribution of the signatures in a barplot 191 # Visualize the contribution of the signatures in a barplot
183 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE) 192 pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "relative", coord_flip = TRUE)
184 # Visualize the contribution of the signatures in absolute number of mutations 193 # Visualize the contribution of the signatures in absolute number of mutations
185 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="absolute", coord_flip = TRUE) 194 pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode = "absolute", coord_flip = TRUE)
186 # Combine the two plots: 195 # Combine the two plots:
187 grid.arrange(pc1, pc2) 196 grid.arrange(pc1, pc2)
188 197
189 # The relative contribution of each signature for each sample can also be plotted as a heatmap with 198 # The relative contribution of each signature for each sample can also be plotted as a heatmap with
190 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots. 199 # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots.
191 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures 200 # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures
192 # can be plotted in a user-specified order. 201 # can be plotted in a user-specified order.
193 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order: 202 # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order:
194 pch1 <- plot_contribution_heatmap(nmf_res$contribution, 203 pch1 <- plot_contribution_heatmap(nmf_res$contribution,
195 sig_order = paste0("NewSig_", 1:opt$newsignum)) 204 sig_order = paste0("NewSig_", 1:opt$newsignum))
196 # Plot signature contribution as a heatmap without sample clustering: 205 # Plot signature contribution as a heatmap without sample clustering:
197 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE) 206 pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples = FALSE)
198 #Combine the plots into one figure: 207 #Combine the plots into one figure:
199 grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6)) 208 grid.arrange(pch1, pch2, ncol = 2, widths = c(2, 1.6))
200 209
201 # Compare the reconstructed mutational profile with the original mutational profile: 210 # Compare the reconstructed mutational profile with the original mutational profile:
202 plot_compare_profiles(pseudo_mut_mat[,1], 211 plot_compare_profiles(pseudo_mut_mat[, 1],
203 nmf_res$reconstructed[,1], 212 nmf_res$reconstructed[, 1],
204 profile_names = c("Original", "Reconstructed"), 213 profile_names = c("Original", "Reconstructed"),
205 condensed = TRUE) 214 condensed = TRUE)
206 dev.off() 215 dev.off()
207 } 216 }
208 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures #### 217 ##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures ####
209 218
210 if (!is.na(opt$output_cosmic)[1]) { 219 if (!is.na(opt$output_cosmic)[1]) {
211 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69) 220 pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69)
212 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix 221 pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
213 if (opt$cosmic_version == "v2") { 222 if (opt$cosmic_version == "v2") {
214 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") 223 sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
215 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) 224 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)
216 new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) 225 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
217 cancer_signatures = cancer_signatures[as.vector(new_order),] 226 cancer_signatures <- cancer_signatures[as.vector(new_order), ]
218 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type 227 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type
219 cancer_signatures = as.matrix(cancer_signatures[,4:33]) 228 cancer_signatures <- as.matrix(cancer_signatures[, 4:33])
220 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels 229 colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
221 cosmic_tag <- "Signatures (Cosmic v2, March 2015)" 230 cosmic_tag <- "Signatures (Cosmic v2, March 2015)"
222 cosmic_colors <- col_vector[1:30] 231 cosmic_colors <- col_vector[1:30]
223 } else { 232 } else {
224 sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv" 233 sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv"
225 cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) 234 cancer_signatures <- read.table(sp_url, sep = "\t", header = TRUE)
226 new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) 235 new_order <- match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
227 cancer_signatures = cancer_signatures[as.vector(new_order),] 236 cancer_signatures <- cancer_signatures[as.vector(new_order), ]
228 row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type 237 row.names(cancer_signatures) <- cancer_signatures$Somatic.Mutation.Type
229 cancer_signatures = as.matrix(cancer_signatures[,4:70]) 238 cancer_signatures <- as.matrix(cancer_signatures[, 4:70])
230 colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels 239 colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels
231 cosmic_tag <- "Signatures (Cosmic v3, May 2019)" 240 cosmic_tag <- "Signatures (Cosmic v3, May 2019)"
232 cosmic_colors <- col_vector[1:67] 241 cosmic_colors <- col_vector[1:67]
233 } 242 }
234 243
235 # Plot mutational profiles of the COSMIC signatures 244 # Plot mutational profiles of the COSMIC signatures
236 if (opt$cosmic_version == "v2") { 245 if (opt$cosmic_version == "v2") {
237 p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) 246 p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
238 grid.arrange(p6, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3))) 247 grid.arrange(p6, top = textGrob("COSMIC signature profiles", gp = gpar(fontsize = 12, font = 3)))
239 } else { 248 } else {
240 p6 <- plot_96_profile(cancer_signatures[,1:33], condensed = TRUE, ymax = 0.3) 249 p6 <- plot_96_profile(cancer_signatures[, 1:33], condensed = TRUE, ymax = 0.3)
241 p6bis <- plot_96_profile(cancer_signatures[,34:67], condensed = TRUE, ymax = 0.3) 250 p6bis <- plot_96_profile(cancer_signatures[, 34:67], condensed = TRUE, ymax = 0.3)
242 grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)",gp=gpar(fontsize=12,font=3))) 251 grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)",
243 grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)",gp=gpar(fontsize=12,font=3))) 252 gp = gpar(fontsize = 12, font = 3)))
244 } 253 grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)",
245 254 gp = gpar(fontsize = 12, font = 3)))
246 255 }
256
257
247 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles 258 # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles
248 fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures) 259 fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures)
249 260
250 # Plot contribution barplots 261 # Plot contribution barplots
251 pc3 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "absolute") 262 pc3 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "absolute")
252 pc4 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "relative") 263 pc4 <- plot_contribution(fit_res$contribution, cancer_signatures, coord_flip = T, mode = "relative")
253 pc3_data <- pc3$data 264 pc3_data <- pc3$data
254 pc3 <- ggplot(pc3_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + 265 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
255 geom_bar(stat="identity", position='stack') + 266 geom_bar(stat = "identity", position = "stack") +
256 coord_flip() + 267 coord_flip() +
257 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + 268 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
258 labs(x = "Samples", y = "Absolute contribution") + theme_bw() + 269 labs(x = "Samples", y = "Absolute contribution") + theme_bw() +
259 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position="right", 270 theme(panel.grid.minor.x = element_blank(),
260 text = element_text(size=8), 271 panel.grid.major.x = element_blank(),
261 axis.text.x = element_text(angle=90, hjust=1)) 272 legend.position = "right",
273 text = element_text(size = 8),
274 axis.text.x = element_text(angle = 90, hjust = 1))
262 pc4_data <- pc4$data 275 pc4_data <- pc4$data
263 pc4 <- ggplot(pc4_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + 276 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
264 geom_bar(stat="identity", position='fill') + 277 geom_bar(stat = "identity", position = "fill") +
265 coord_flip() + 278 coord_flip() +
266 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + 279 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
267 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 280 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
268 labs(x = "Samples", y = "Relative contribution") + theme_bw() + 281 labs(x = "Samples", y = "Relative contribution") + theme_bw() +
269 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position="right", 282 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position = "right",
270 text = element_text(size=8), 283 text = element_text(size = 8),
271 axis.text.x = element_text(angle=90, hjust=1)) 284 axis.text.x = element_text(angle = 90, hjust = 1))
272 ##### 285 #####
273 # ggplot2 alternative 286 # ggplot2 alternative
274 if (!is.na(opt$levels)[1]) { # if there are levels to display in graphs 287 if (!is.na(opt$levels)[1]) { # if there are levels to display in graphs
275 pc3_data <- pc3$data 288 pc3_data <- pc3$data
276 pc3_data <- merge (pc3_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier") 289 pc3_data <- merge(pc3_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier")
277 pc3 <- ggplot(pc3_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + 290 pc3 <- ggplot(pc3_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
278 geom_bar(stat="identity", position='stack') + 291 geom_bar(stat = "identity", position = "stack") +
279 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + 292 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
280 labs(x = "Samples", y = "Absolute contribution") + theme_bw() + 293 labs(x = "Samples", y = "Absolute contribution") + theme_bw() +
281 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position="right", 294 theme(panel.grid.minor.x = element_blank(),
282 text = element_text(size=8), 295 panel.grid.major.x = element_blank(),
283 axis.text.x = element_text(angle=90, hjust=1)) + 296 legend.position = "right",
284 facet_grid(~level, scales = "free_x", space="free") 297 text = element_text(size = 8),
298 axis.text.x = element_text(angle = 90, hjust = 1)) +
299 facet_grid(~level, scales = "free_x", space = "free")
285 pc4_data <- pc4$data 300 pc4_data <- pc4$data
286 pc4_data <- merge (pc4_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier") 301 pc4_data <- merge(pc4_data, metadata_table[, c(1, 3)], by.x = "Sample", by.y = "element_identifier")
287 pc4 <- ggplot(pc4_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + 302 pc4 <- ggplot(pc4_data, aes(x = Sample, y = Contribution, fill = as.factor(Signature))) +
288 geom_bar(stat="identity", position='fill') + 303 geom_bar(stat = "identity", position = "fill") +
289 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) + 304 scale_fill_manual(name = "Cosmic\nSignatures", values = cosmic_colors) +
290 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 305 scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
291 labs(x = "Samples", y = "Relative contribution") + theme_bw() + 306 labs(x = "Samples", y = "Relative contribution") + theme_bw() +
292 theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank(), legend.position="right", 307 theme(panel.grid.minor.x = element_blank(),
293 text = element_text(size=8), 308 panel.grid.major.x = element_blank(),
294 axis.text.x = element_text(angle=90, hjust=1)) + 309 legend.position = "right",
295 facet_grid(~level, scales = "free_x", space="free") 310 text = element_text(size = 8),
311 axis.text.x = element_text(angle = 90, hjust = 1)) +
312 facet_grid(~level, scales = "free_x", space = "free")
296 } 313 }
297 # Combine the two plots: 314 # Combine the two plots:
298 grid.arrange(pc3, pc4, top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns",gp=gpar(fontsize=12,font=3))) 315 grid.arrange(pc3, pc4,
299 316 top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns",
317 gp = gpar(fontsize = 12, font = 3)))
318
300 #### pie charts of comic signatures contributions in samples ### 319 #### pie charts of comic signatures contributions in samples ###
301 library(reshape2) 320 library(reshape2)
302 library(dplyr) 321 library(dplyr)
303 if (length(levels(factor(levels_table$level))) < 2) { 322 if (length(levels(factor(levels_table$level))) < 2) {
304 fit_res_contrib <- as.data.frame(fit_res$contribution) 323 fit_res_contrib <- as.data.frame(fit_res$contribution)
305 worklist <- cbind(signature=rownames(fit_res$contribution), 324 worklist <- cbind(signature = rownames(fit_res$contribution),
306 level=rep("nolabels", length(fit_res_contrib[,1])), 325 level = rep("nolabels", length(fit_res_contrib[, 1])),
307 fit_res_contrib, 326 fit_res_contrib,
308 sum=rowSums(fit_res_contrib)) 327 sum = rowSums(fit_res_contrib))
309 worklist <- worklist[order(worklist[ ,"sum"], decreasing = T), ] 328 worklist <- worklist[order(worklist[, "sum"], decreasing = T), ]
310 worklist <- worklist[1:opt$signum,] 329 worklist <- worklist[1:opt$signum, ]
311 worklist <- worklist[ , -length(worklist[1,])] 330 worklist <- worklist[, -length(worklist[1, ])]
312 worklist <- melt(worklist) 331 worklist <- melt(worklist)
313 worklist <- worklist[,c(1,3,4,2)] 332 worklist <- worklist[, c(1, 3, 4, 2)]
314 } else { 333 } else {
315 worklist <- list() 334 worklist <- list()
316 for (i in levels(factor(levels_table$level))) { 335 for (i in levels(factor(levels_table$level))) {
317 fit_res$contribution[,levels_table$element_identifier[levels_table$level == i]] -> worklist[[i]] 336 fit_res$contribution[, levels_table$element_identifier[levels_table$level == i]] -> worklist[[i]]
318 sum <- rowSums(as.data.frame(worklist[[i]])) 337 sum <- rowSums(as.data.frame(worklist[[i]]))
319 worklist[[i]] <- cbind(worklist[[i]], sum) 338 worklist[[i]] <- cbind(worklist[[i]], sum)
320 worklist[[i]] <- worklist[[i]][order(worklist[[i]][ ,"sum"], decreasing = T), ] 339 worklist[[i]] <- worklist[[i]][order(worklist[[i]][, "sum"], decreasing = T), ]
321 worklist[[i]] <- worklist[[i]][1:opt$signum,] 340 worklist[[i]] <- worklist[[i]][1:opt$signum, ]
322 worklist[[i]] <- worklist[[i]][ , -length(as.data.frame(worklist[[i]]))] 341 worklist[[i]] <- worklist[[i]][, -length(as.data.frame(worklist[[i]]))]
323 } 342 }
324 worklist <- as.data.frame(melt(worklist)) 343 worklist <- as.data.frame(melt(worklist))
325 worklist[,2] <- paste0(worklist[,4], " - ", worklist[,2]) 344 worklist[, 2] <- paste0(worklist[, 4], " - ", worklist[, 2])
326 head(worklist) 345 head(worklist)
327 } 346 }
328 347
329 colnames(worklist) <- c("signature", "sample", "value", "level") 348 colnames(worklist) <- c("signature", "sample", "value", "level")
330 worklist <- as.data.frame(worklist %>% group_by(sample) %>% mutate(value=value/sum(value)*100)) 349 worklist <- as.data.frame(worklist %>% group_by(sample) %>% mutate(value = value / sum(value) * 100))
331 worklist$pos <- cumsum(worklist$value) - worklist$value/2 350 worklist$pos <- cumsum(worklist$value) - worklist$value / 2
332 worklist$label <- factor(worklist$signature) 351 worklist$label <- factor(worklist$signature)
333 worklist$signature <- factor(worklist$signature) 352 worklist$signature <- factor(worklist$signature)
334 p7 <- ggplot(worklist, aes(x="", y=value, group=signature, fill=signature)) + 353 p7 <- ggplot(worklist, aes(x = "", y = value, group = signature, fill = signature)) +
335 geom_bar(width = 1, stat = "identity") + 354 geom_bar(width = 1, stat = "identity") +
336 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) + 355 geom_text(aes(label = label), position = position_stack(vjust = 0.5), color = "black", size = 3) +
337 coord_polar("y", start=0) + facet_wrap(.~sample) + 356 coord_polar("y", start = 0) + facet_wrap(.~sample) +
338 labs(x="", y="Samples", fill = cosmic_tag) + 357 labs(x = "", y = "Samples", fill = cosmic_tag) +
339 scale_fill_manual(name = paste0(opt$signum, " most contributing\nsignatures\n(in each label/tissue)"), 358 scale_fill_manual(name = paste0(opt$signum, " most contributing\nsignatures\n(in each label/tissue)"),
340 values = cosmic_colors[as.numeric(levels(factor(worklist$label)))]) + 359 values = cosmic_colors[as.numeric(levels(factor(worklist$label)))]) +
341 theme(axis.text = element_blank(), 360 theme(axis.text = element_blank(),
342 axis.ticks = element_blank(), 361 axis.ticks = element_blank(),
343 panel.grid = element_blank()) 362 panel.grid = element_blank())
346 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering 365 # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering
347 if (length(vcf_paths) > 1) { 366 if (length(vcf_paths) > 1) {
348 p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") 367 p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
349 grid.arrange(p8) 368 grid.arrange(p8)
350 } 369 }
351 370
352 # export relative contribution matrix 371 # export relative contribution matrix
353 if (!is.na(opt$sig_contrib_matrix)) { 372 if (!is.na(opt$sig_contrib_matrix)) {
354 output_table <- t(fit_res$contribution)/rowSums(t(fit_res$contribution)) 373 output_table <- t(fit_res$contribution) / rowSums(t(fit_res$contribution))
355 colnames(output_table) <- paste0("s", colnames(output_table)) 374 colnames(output_table) <- paste0("s", colnames(output_table))
356 if (length(levels(factor(levels_table$level))) > 1) { 375 if (length(levels(factor(levels_table$level))) > 1) {
357 output_table <- data.frame(sample=paste0(metadata_table[metadata_table$element_identifier==colnames(fit_res$contribution), 376 output_table <- data.frame(sample = paste0(metadata_table[metadata_table$element_identifier == colnames(fit_res$contribution),
358 3], "-", colnames(fit_res$contribution) ), 377 3], "-", colnames(fit_res$contribution)),
359 output_table) 378 output_table)
360 } else { 379 } else {
361 output_table <- data.frame(sample=rownames(output_table), output_table) 380 output_table <- data.frame(sample = rownames(output_table), output_table)
362 } 381 }
363 write.table(output_table, file = opt$sig_contrib_matrix, sep = "\t", quote = F, row.names = F) 382 write.table(output_table, file = opt$sig_contrib_matrix, sep = "\t", quote = F, row.names = F)
364 } 383 }
365 384
366 # calculate all pairwise cosine similarities 385 # calculate all pairwise cosine similarities
367 cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed) 386 cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed)
368 # extract cosine similarities per sample between original and reconstructed 387 # extract cosine similarities per sample between original and reconstructed
369 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec)) 388 cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec))
370 389
371 # We can use ggplot to make a barplot of the cosine similarities between the original and 390 # We can use ggplot to make a barplot of the cosine similarities between the original and
372 # reconstructed mutational profile of each sample. This clearly shows how well each mutational 391 # reconstructed mutational profile of each sample. This clearly shows how well each mutational
373 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles 392 # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles
374 # have a cosine similarity of 1. The lower the cosine similarity between original and 393 # have a cosine similarity of 1. The lower the cosine similarity between original and
375 # reconstructed, the less well the original mutational profile can be reconstructed with 394 # reconstructed, the less well the original mutational profile can be reconstructed with
376 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff. 395 # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff.
377 396
378 # Adjust data frame for plotting with gpplot 397 # Adjust data frame for plotting with gpplot
379 colnames(cos_sim_ori_rec) = "cos_sim" 398 colnames(cos_sim_ori_rec) <- "cos_sim"
380 cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec) 399 cos_sim_ori_rec$sample <- row.names(cos_sim_ori_rec)
381 # Make barplot 400 # Make barplot
382 p9 <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) + 401 p9 <- ggplot(cos_sim_ori_rec, aes(y = cos_sim, x = sample)) +
383 geom_bar(stat="identity", fill = "skyblue4") + 402 geom_bar(stat = "identity", fill = "skyblue4") +
384 coord_cartesian(ylim=c(0.8, 1)) + 403 coord_cartesian(ylim = c(0.8, 1)) +
385 # coord_flip(ylim=c(0.8,1)) + 404 # coord_flip(ylim=c(0.8,1)) +
386 ylab("Cosine similarity\n original VS reconstructed") + 405 ylab("Cosine similarity\n original VS reconstructed") +
387 xlab("") + 406 xlab("") +
388 # Reverse order of the samples such that first is up 407 # Reverse order of the samples such that first is up
389 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) + 408 # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) +
390 theme_bw() + 409 theme_bw() +
391 theme(panel.grid.minor.y=element_blank(), 410 theme(panel.grid.minor.y = element_blank(),
392 panel.grid.major.y=element_blank()) + 411 panel.grid.major.y = element_blank()) +
393 # Add cut.off line 412 # Add cut.off line
394 geom_hline(aes(yintercept=.95)) 413 geom_hline(aes(yintercept = .95))
395 grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3))) 414 grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)", gp = gpar(fontsize = 12, font = 3)))
396 dev.off() 415 dev.off()
397 } 416 }
398 417
399 418
400 # Output RData file 419 # Output RData file
401 if (!is.null(opt$rdata)) { 420 if (!is.null(opt$rdata)) {
402 save.image(file=opt$rdata) 421 save.image(file = opt$rdata)
403 } 422 }