diff mutational_patterns.R @ 3:e332cf9dfa06 draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit 0f7593f703ba0dfd12aea1c0460e371f08b57d2f"
author artbio
date Thu, 24 Sep 2020 01:32:52 +0000
parents aea952be68cb
children 7ba08c826888
line wrap: on
line diff
--- a/mutational_patterns.R	Wed Sep 16 22:58:28 2020 +0000
+++ b/mutational_patterns.R	Thu Sep 24 01:32:52 2020 +0000
@@ -29,6 +29,12 @@
     help = "path to the tab separated file describing the levels in function of datasets"
   ),
   make_option(
+    "--cosmic_version",
+    default = "v2",
+    type = 'character',
+    help = "Version of the Cosmic Signature set to be used to express mutational patterns"
+  ),
+  make_option(
     "--signum",
     default = 2,
     type = 'integer',
@@ -52,7 +58,6 @@
     type = 'integer',
     help = "Number of new signatures to be captured"
   ),
-
   make_option(
     "--output_spectrum",
     default = NA,
@@ -66,11 +71,22 @@
     help = "path to output dataset"
   ),
   make_option(
+    "--sigmatrix",
+    default = NA,
+    type = 'character',
+    help = "path to signature matrix"
+  ),
+  make_option(
     "--output_cosmic",
     default = NA,
     type = 'character',
     help = "path to output dataset"
-  )
+  ),
+  make_option(
+    c("-r", "--rdata"),
+    type="character",
+    default=NULL,
+    help="Path to RData output file")
 )
 
 opt = parse_args(OptionParser(option_list = option_list),
@@ -81,28 +97,27 @@
 parser <- newJSONParser()
 parser$addData(json_dict)
 fileslist <- parser$getObject()
-vcf_files <- attr(fileslist, "names")
-sample_names <- unname(unlist(fileslist))
+vcf_paths <- attr(fileslist, "names")
+element_identifiers <- unname(unlist(fileslist))
 ref_genome <- opt$genome
-vcf_table <- data.frame(sample_name=sample_names, path=vcf_files)
+vcf_table <- data.frame(element_identifier=element_identifiers, path=vcf_paths)
 
 library(MutationalPatterns)
 library(ref_genome, character.only = TRUE)
+library(ggplot2)
 
 # Load the VCF files into a GRangesList:
-vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome)
+vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome)
 if (!is.na(opt$levels)[1]) {  # manage levels if there are
-    levels_table  <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level"))
-    library(dplyr)
-    metadata_table <- left_join(vcf_table, levels_table, by = "sample_name")
-#    detach("package:dplyr", unload=TRUE)
-    tissue <- metadata_table$level
+    levels_table  <- read.delim(opt$levels, header=FALSE, col.names=c("element_identifier","level"))
+    library(plyr)
+    metadata_table <- join(vcf_table, levels_table, by = "element_identifier")
+    tissue <- as.vector(metadata_table$level)
 }
 
 ##### This is done for any section ######
 mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome)
 
-
 ###### Section 1 Mutation characteristics and spectrums #############
 if (!is.na(opt$output_spectrum)[1]) {
     pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69)
@@ -114,34 +129,41 @@
         p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE)
         plot(p1)
     } else {
-        p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE, legend=TRUE) # by sample
+        p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE) # by levels
         p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total
         grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1))
-    }
+   }
     plot_96_profile(mut_mat, condensed = TRUE)
     dev.off()
 }
 
 ###### Section 2: De novo mutational signature extraction using NMF #######
 if (!is.na(opt$output_denovo)[1]) {
-    mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
+    # opt$rank cannot be higher than the number of samples
+    if (opt$rank > length(element_identifiers)) {opt$rank <-length(element_identifiers)}
+    # likewise, opt$signum cannot be higher thant the number of samples
+    if (opt$signum > length(element_identifiers)) {opt$signum <-length(element_identifiers)}
+    pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix
     # Use the NMF package to generate an estimate rank plot
     library("NMF")
-    estimate <- nmf(mut_mat, rank=1:opt$rank+1, method="brunet", nrun=opt$nrun, seed=123456)
+    estimate <- nmf(pseudo_mut_mat, rank=1:opt$rank, method="brunet", nrun=opt$nrun, seed=123456)
     # And plot it
     pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69)
-    p.trans <- plot(estimate)
-    grid.arrange(p.trans)
+    p4 <- plot(estimate)
+    grid.arrange(p4)
     # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures
     # (For larger datasets it is wise to perform more iterations by changing the nrun parameter
     # to achieve stability and avoid local minima)
-    nmf_res <- extract_signatures(mut_mat, rank=opt$rank, nrun=opt$nrun)
+    nmf_res <- extract_signatures(pseudo_mut_mat, rank=opt$newsignum, nrun=opt$nrun)
     # Assign signature names
-    colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$rank)
-    rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$rank)
+    colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum)
+    rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum)
     # Plot the 96-profile of the signatures:
-    p.trans <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
-    grid.arrange(p.trans)
+    p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE)
+    new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value")
+    new_sig_matrix = format(new_sig_matrix, scientific=TRUE)
+    write.table(new_sig_matrix, file=opt$sigmatrix, quote = FALSE, row.names = FALSE, sep="\t") 
+    grid.arrange(p5)
     # Visualize the contribution of the signatures in a barplot
     pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE)
     # Visualize the contribution of the signatures in absolute number of mutations
@@ -155,14 +177,14 @@
     # can be plotted in a user-specified order.
     # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order:
     pch1 <- plot_contribution_heatmap(nmf_res$contribution,
-                                      sig_order = paste0("NewSig_", 1:opt$rank))
+                                      sig_order = paste0("NewSig_", 1:opt$newsignum))
     # Plot signature contribution as a heatmap without sample clustering:
     pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE)
     #Combine the plots into one figure:
     grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6))
     
     # Compare the reconstructed mutational profile with the original mutational profile:   
-    plot_compare_profiles(mut_mat[,1],
+    plot_compare_profiles(pseudo_mut_mat[,1],
                           nmf_res$reconstructed[,1],
                           profile_names = c("Original", "Reconstructed"),
                           condensed = TRUE)
@@ -172,18 +194,38 @@
 
 if (!is.na(opt$output_cosmic)[1]) {
     pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69)
-    sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
-    cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
-    mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
-    new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type)
-    cancer_signatures = cancer_signatures[as.vector(new_order),]
-    row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
-    cancer_signatures = as.matrix(cancer_signatures[,4:33])
+    pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix
+    if (opt$cosmic_version == "v2") {
+        sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "")
+        cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
+        new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
+        cancer_signatures = cancer_signatures[as.vector(new_order),]
+        row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
+        cancer_signatures = as.matrix(cancer_signatures[,4:33])
+        colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
+        cosmic_tag <- "Signatures (Cosmic v2, March 2015)"
+        } else {
+        sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv"
+        cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE)
+        new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type)
+        cancer_signatures = cancer_signatures[as.vector(new_order),]
+        row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type
+        cancer_signatures = as.matrix(cancer_signatures[,4:70])        
+        colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels
+        cosmic_tag <- "Signatures (Cosmic v3, May 2019)"
+        }
     
     # Plot mutational profiles of the COSMIC signatures
-    colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels
-    p.trans <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
-    grid.arrange(p.trans, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3)))
+    if (opt$cosmic_version == "v2") {
+        p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3)
+        grid.arrange(p6, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3)))
+    } else {
+        print(length(cancer_signatures))
+        p6 <- plot_96_profile(cancer_signatures[,1:33], condensed = TRUE, ymax = 0.3)
+        p6bis <- plot_96_profile(cancer_signatures[,34:67], condensed = TRUE, ymax = 0.3)
+        grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)",gp=gpar(fontsize=12,font=3)))
+        grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)",gp=gpar(fontsize=12,font=3)))
+    }
 
     # Hierarchically cluster the COSMIC signatures based on their similarity with average linkage
     # hclust_cosmic = cluster_signatures(cancer_signatures, method = "average")
@@ -211,16 +253,37 @@
     # grid.arrange(p.trans)
     
     # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles
-    fit_res <- fit_to_signatures(mut_mat, cancer_signatures)
+    fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures)
 
     # Select signatures with some contribution (above a threshold)
     threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1]
     select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded
     # Plot contribution barplots
-    pc1 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute")
-    pc2 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative")
+    pc3 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute")
+    pc4 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative")
+    #####
+    # ggplot2 alternative
+    if (!is.na(opt$levels)[1]) {  # if there are levels to display in graphs
+        pc3_data <- pc3$data
+        pc3_data <- merge (pc3_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier")
+        pc3 <- ggplot(pc3_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) +
+               geom_bar(stat="identity", position='stack') +
+               scale_fill_discrete(name="Cosmic\nSignature") +
+               labs(x = "Samples", y = "Absolute contribution") + theme_bw() + 
+               theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank()) + 
+               facet_grid(~level, scales = "free_x")
+        pc4_data <- pc4$data
+        pc4_data <- merge (pc4_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier")
+        pc4 <- ggplot(pc4_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) +
+               geom_bar(stat="identity", position='fill') +
+               scale_fill_discrete(name="Cosmic\nSignature") +
+               scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
+               labs(x = "Samples", y = "Relative contribution") + theme_bw() + 
+               theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank()) + 
+               facet_grid(~level, scales = "free_x")
+    }
     # Combine the two plots:
-    grid.arrange(pc1, pc2)
+    grid.arrange(pc3, pc4, top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns",gp=gpar(fontsize=12,font=3)))
     ## pie charts of comic signatures contributions in samples
     
     sig_data_pie <- as.data.frame(t(head(fit_res$contribution[select,])))
@@ -231,20 +294,19 @@
     melted_sig_data_pie_percents <-melt(data=sig_data_pie_percents)
     melted_sig_data_pie_percents$label <- sub("Sig.", "", melted_sig_data_pie_percents$variable)
     melted_sig_data_pie_percents$pos <- cumsum(melted_sig_data_pie_percents$value) - melted_sig_data_pie_percents$value/2
-    library(ggplot2)
-    p.trans <-  ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) +
+    p7 <-  ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) +
                        geom_bar(width = 1, stat = "identity") +
                        geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) +
                        coord_polar("y", start=0) + facet_wrap(~ sample) +
-                       labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") +
+                       labs(x="", y="Samples", fill = cosmic_tag) +
                        theme(axis.text = element_blank(),
                              axis.ticks = element_blank(),
                              panel.grid  = element_blank())
-    grid.arrange(p.trans)
+    grid.arrange(p7)
 
     # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering
-    p.trans <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
-    grid.arrange(p.trans)
+    p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete")
+    grid.arrange(p8)
 
     # Compare the reconstructed mutational profile of sample 1 with its original mutational profile
     # plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1],
@@ -255,7 +317,7 @@
     # `cos_sim_matrix`
     
     # calculate all pairwise cosine similarities
-    cos_sim_ori_rec <- cos_sim_matrix(mut_mat, fit_res$reconstructed)
+    cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed)
     # extract cosine similarities per sample between original and reconstructed
     cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec))
     
@@ -270,7 +332,7 @@
     colnames(cos_sim_ori_rec) = "cos_sim"
     cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec)
     # Make barplot
-    p.trans <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) +
+    p9 <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) +
                       geom_bar(stat="identity", fill = "skyblue4") +
                       coord_cartesian(ylim=c(0.8, 1)) +
                       # coord_flip(ylim=c(0.8,1)) +
@@ -283,7 +345,12 @@
                       panel.grid.major.y=element_blank()) +
                       # Add cut.off line
                       geom_hline(aes(yintercept=.95))
-    grid.arrange(p.trans, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3)))    
-    
+    grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3)))    
     dev.off()
 }
+
+
+# Output RData file
+if (!is.null(opt$rdata)) {
+  save.image(file=opt$rdata)
+}