diff mutational_patterns.R @ 2:aea952be68cb draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit cd8f633245d53cf47eaf860a4e0ae8d806c34419"
author artbio
date Wed, 16 Sep 2020 22:58:28 +0000
parents 924c527fb379
children e332cf9dfa06
line wrap: on
line diff
--- a/mutational_patterns.R	Sun Sep 13 22:27:03 2020 +0000
+++ b/mutational_patterns.R	Wed Sep 16 22:58:28 2020 +0000
@@ -5,8 +5,8 @@
 warnings()
 library(optparse)
 library(rjson)
-library(MutationalPatterns)
-library(ggplot2)
+library(grid)
+library(gridExtra)
 
 # Arguments
 option_list = list(
@@ -35,7 +35,38 @@
     help = "selects the N most significant signatures in samples to express mutational patterns"
   ),
   make_option(
-    "--output",
+    "--nrun",
+    default = 2,
+    type = 'integer',
+    help = "Number of runs to fit signatures"
+  ),
+  make_option(
+    "--rank",
+    default = 2,
+    type = 'integer',
+    help = "number of ranks to display for parameter optimization"
+  ),
+    make_option(
+    "--newsignum",
+    default = 2,
+    type = 'integer',
+    help = "Number of new signatures to be captured"
+  ),
+
+  make_option(
+    "--output_spectrum",
+    default = NA,
+    type = 'character',
+    help = "path to output dataset"
+  ),
+  make_option(
+    "--output_denovo",
+    default = NA,
+    type = 'character',
+    help = "path to output dataset"
+  ),
+  make_option(
+    "--output_cosmic",
     default = NA,
     type = 'character',
     help = "path to output dataset"
@@ -45,82 +76,214 @@
 opt = parse_args(OptionParser(option_list = option_list),
                  args = commandArgs(trailingOnly = TRUE))
 
+################ Manage input data ####################
 json_dict <- opt$inputs
 parser <- newJSONParser()
 parser$addData(json_dict)
 fileslist <- parser$getObject()
 vcf_files <- attr(fileslist, "names")
 sample_names <- unname(unlist(fileslist))
-pdf(opt$output, paper = "special", width = 11.69, height = 11.69)
 ref_genome <- opt$genome
+vcf_table <- data.frame(sample_name=sample_names, path=vcf_files)
+
+library(MutationalPatterns)
 library(ref_genome, character.only = TRUE)
 
 # Load the VCF files into a GRangesList:
 vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome)
-levels_table  <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level"))
-vcf_table <- data.frame(path=vcf_files, sample_name=sample_names)
-metadata_table <- merge(vcf_table, levels_table, by.x=2, by.y=1)
-levels <- metadata_table$level
-muts = mutations_from_vcf(vcfs[[1]])
-types = mut_type(vcfs[[1]])
-context = mut_context(vcfs[[1]], ref_genome)
-type_context = type_context(vcfs[[1]], ref_genome)
-type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
-# p1 <- plot_spectrum(type_occurrences)
-# p2 <- plot_spectrum(type_occurrences, CT = TRUE)
-# p3 <- plot_spectrum(type_occurrences, CT = TRUE, legend = FALSE)
-# 
-# plot(p2)
-# p4 <- plot_spectrum(type_occurrences, by = levels, CT = TRUE, legend = TRUE)
-# palette <- c("pink", "orange", "blue", "lightblue", "green", "red", "purple")
-# p5 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE, colors=palette)
-# 
-# plot(p4)
+if (!is.na(opt$levels)[1]) {  # manage levels if there are
+    levels_table  <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level"))
+    library(dplyr)
+    metadata_table <- left_join(vcf_table, levels_table, by = "sample_name")
+#    detach("package:dplyr", unload=TRUE)
+    tissue <- metadata_table$level
+}
+
+##### This is done for any section ######
 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)
-# plot_96_profile(mut_mat[,1:length(as.data.frame(mut_mat))], condensed = TRUE)
-mut_mat <- mut_mat + 0.0001
-# library("NMF")
-# estimate <- nmf(mut_mat, rank=2:5, method="brunet", nrun=100, seed=123456)
-# plot(estimate)
-# nmf_res <- extract_signatures(mut_mat, rank = 4, nrun = 100)
-# colnames(nmf_res$signatures) <- c("Signature A", "Signature B", "Signature C", "Signature D")
-# rownames(nmf_res$contribution) <- c("Signature A", "Signature B", "Signature C", "Signature D")
-# plot_96_profile(nmf_res$signatures, condensed = TRUE)
-sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
-cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
-new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type)
-cancer_signatures = cancer_signatures[as.vector(new_order),]
-row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
-cancer_signatures = as.matrix(cancer_signatures[,4:33])
-# plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
-hclust_cosmic = cluster_signatures(cancer_signatures, method = "average")
-cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order]
-# plot(hclust_cosmic)
-cos_sim(mut_mat[,1], cancer_signatures[,1])
-cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures)
-plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE)
-fit_res <- fit_to_signatures(mut_mat, cancer_signatures)
-threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1]
-select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded
-plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute")
-plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative")
-plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
 
 
-sig5data <- as.data.frame(t(head(fit_res$contribution[select,])))
-colnames(sig5data) <- gsub("nature", "", colnames(sig5data))
-sig5data_percents <- sig5data / (apply(sig5data,1,sum)) * 100
-sig5data_percents$sample <- rownames(sig5data_percents)
-library(reshape2)
-melted_sig5data_percents <-melt(data=sig5data_percents)
-melted_sig5data_percents$label <- sub("Sig.", "", melted_sig5data_percents$variable)
-melted_sig5data_percents$pos <- cumsum(melted_sig5data_percents$value) - melted_sig5data_percents$value/2
-ggplot(melted_sig5data_percents, aes(x="", y=value, group=variable, fill=variable)) +
-  geom_bar(width = 1, stat = "identity") +
-  geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) +
-  coord_polar("y", start=0) + facet_wrap(~ sample) +
-  labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") +
-    theme(axis.text = element_blank(),
-        axis.ticks = element_blank(),
-        panel.grid  = element_blank())
-dev.off()
+###### Section 1 Mutation characteristics and spectrums #############
+if (!is.na(opt$output_spectrum)[1]) {
+    pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69)
+    type_occurrences <- mut_type_occurrences(vcfs, ref_genome)
+    
+    # mutation spectrum, total or by sample
+    
+    if (is.na(opt$levels)[1]) {
+        p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE)
+        plot(p1)
+    } else {
+        p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE, legend=TRUE) # by sample
+        p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total
+        grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1))
+    }
+    plot_96_profile(mut_mat, condensed = TRUE)
+    dev.off()
+}
+
+###### Section 2: De novo mutational signature extraction using NMF #######
+if (!is.na(opt$output_denovo)[1]) {
+    mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
+    # Use the NMF package to generate an estimate rank plot
+    library("NMF")
+    estimate <- nmf(mut_mat, rank=1:opt$rank+1, method="brunet", nrun=opt$nrun, seed=123456)
+    # And plot it
+    pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69)
+    p.trans <- plot(estimate)
+    grid.arrange(p.trans)
+    # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures
+    # (For larger datasets it is wise to perform more iterations by changing the nrun parameter
+    # to achieve stability and avoid local minima)
+    nmf_res <- extract_signatures(mut_mat, rank=opt$rank, nrun=opt$nrun)
+    # Assign signature names
+    colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$rank)
+    rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$rank)
+    # Plot the 96-profile of the signatures:
+    p.trans <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
+    grid.arrange(p.trans)
+    # Visualize the contribution of the signatures in a barplot
+    pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE)
+    # Visualize the contribution of the signatures in absolute number of mutations
+    pc2 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="absolute", coord_flip = TRUE)
+    # Combine the two plots:
+    grid.arrange(pc1, pc2)
+        
+    # The relative contribution of each signature for each sample can also be plotted as a heatmap with
+    # plot_contribution_heatmap, which might be easier to interpret and compare than stacked barplots.
+    # The samples can be hierarchically clustered based on their euclidean dis- tance. The signatures
+    # can be plotted in a user-specified order.
+    # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order:
+    pch1 <- plot_contribution_heatmap(nmf_res$contribution,
+                                      sig_order = paste0("NewSig_", 1:opt$rank))
+    # Plot signature contribution as a heatmap without sample clustering:
+    pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE)
+    #Combine the plots into one figure:
+    grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6))
+    
+    # Compare the reconstructed mutational profile with the original mutational profile:   
+    plot_compare_profiles(mut_mat[,1],
+                          nmf_res$reconstructed[,1],
+                          profile_names = c("Original", "Reconstructed"),
+                          condensed = TRUE)
+    dev.off()
+}
+##### Section 3: Find optimal contribution of known signatures: COSMIC mutational signatures ####
+
+if (!is.na(opt$output_cosmic)[1]) {
+    pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69)
+    sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
+    cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
+    mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
+    new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type)
+    cancer_signatures = cancer_signatures[as.vector(new_order),]
+    row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
+    cancer_signatures = as.matrix(cancer_signatures[,4:33])
+    
+    # Plot mutational profiles of the COSMIC signatures
+    colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
+    p.trans <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
+    grid.arrange(p.trans, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3)))
+
+    # Hierarchically cluster the COSMIC signatures based on their similarity with average linkage
+    # hclust_cosmic = cluster_signatures(cancer_signatures, method = "average")
+    # store signatures in new order
+    # cosmic_order = colnames(cancer_signatures)[hclust_cosmic$order]
+    # plot(hclust_cosmic)
+    
+    # Similarity between mutational profiles and COSMIC signatures
+    
+    
+    # The similarity between each mutational profile and each COSMIC signature, can be calculated
+    # with cos_sim_matrix, and visualized with plot_cosine_heatmap. The cosine similarity reflects
+    # how well each mutational profile can be explained by each signature individually. The advantage
+    # of this heatmap representation is that it shows in a glance the similarity in mutational
+    # profiles between samples, while at the same time providing information on which signatures
+    # are most prominent. The samples can be hierarchically clustered in plot_cosine_heatmap.
+    # The cosine similarity between two mutational profiles/signatures can be calculated with cos_sim :
+    # cos_sim(mut_mat[,1], cancer_signatures[,1])
+    
+    # Calculate pairwise cosine similarity between mutational profiles and COSMIC signatures
+    
+    # cos_sim_samples_signatures = cos_sim_matrix(mut_mat, cancer_signatures)
+    # Plot heatmap with specified signature order
+    # p.trans <- plot_cosine_heatmap(cos_sim_samples_signatures, col_order = cosmic_order, cluster_rows = TRUE)
+    # grid.arrange(p.trans)
+    
+    # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles
+    fit_res <- fit_to_signatures(mut_mat, cancer_signatures)
+
+    # Select signatures with some contribution (above a threshold)
+    threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1]
+    select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded
+    # Plot contribution barplots
+    pc1 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute")
+    pc2 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative")
+    # Combine the two plots:
+    grid.arrange(pc1, pc2)
+    ## pie charts of comic signatures contributions in samples
+    
+    sig_data_pie <- as.data.frame(t(head(fit_res$contribution[select,])))
+    colnames(sig_data_pie) <- gsub("nature", "", colnames(sig_data_pie))
+    sig_data_pie_percents <- sig_data_pie / (apply(sig_data_pie,1,sum)) * 100
+    sig_data_pie_percents$sample <- rownames(sig_data_pie_percents)
+    library(reshape2)
+    melted_sig_data_pie_percents <-melt(data=sig_data_pie_percents)
+    melted_sig_data_pie_percents$label <- sub("Sig.", "", melted_sig_data_pie_percents$variable)
+    melted_sig_data_pie_percents$pos <- cumsum(melted_sig_data_pie_percents$value) - melted_sig_data_pie_percents$value/2
+    library(ggplot2)
+    p.trans <-  ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) +
+                       geom_bar(width = 1, stat = "identity") +
+                       geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) +
+                       coord_polar("y", start=0) + facet_wrap(~ sample) +
+                       labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") +
+                       theme(axis.text = element_blank(),
+                             axis.ticks = element_blank(),
+                             panel.grid  = element_blank())
+    grid.arrange(p.trans)
+
+    # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering
+    p.trans <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
+    grid.arrange(p.trans)
+
+    # Compare the reconstructed mutational profile of sample 1 with its original mutational profile
+    # plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1],
+    #                       profile_names = c("Original", "Reconstructed"),
+    #                       condensed = TRUE)
+    
+    # Calculate the cosine similarity between all original and reconstructed mutational profiles with
+    # `cos_sim_matrix`
+    
+    # calculate all pairwise cosine similarities
+    cos_sim_ori_rec <- cos_sim_matrix(mut_mat, fit_res$reconstructed)
+    # extract cosine similarities per sample between original and reconstructed
+    cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec))
+    
+    # We can use ggplot to make a barplot of the cosine similarities between the original and
+    # reconstructed mutational profile of each sample. This clearly shows how well each mutational
+    # profile can be reconstructed with the COSMIC mutational signatures. Two identical profiles
+    # have a cosine similarity of 1. The lower the cosine similarity between original and
+    # reconstructed, the less well the original mutational profile can be reconstructed with
+    # the COSMIC signatures. You could use, for example, cosine similarity of 0.95 as a cutoff.
+    
+    # Adjust data frame for plotting with gpplot
+    colnames(cos_sim_ori_rec) = "cos_sim"
+    cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec)
+    # Make barplot
+    p.trans <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) +
+                      geom_bar(stat="identity", fill = "skyblue4") +
+                      coord_cartesian(ylim=c(0.8, 1)) +
+                      # coord_flip(ylim=c(0.8,1)) +
+                      ylab("Cosine similarity\n original VS reconstructed") +
+                      xlab("") +
+                      # Reverse order of the samples such that first is up
+                      # xlim(rev(levels(factor(cos_sim_ori_rec$sample)))) +
+                      theme_bw() +
+                      theme(panel.grid.minor.y=element_blank(),
+                      panel.grid.major.y=element_blank()) +
+                      # Add cut.off line
+                      geom_hline(aes(yintercept=.95))
+    grid.arrange(p.trans, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3)))    
+    
+    dev.off()
+}