diff heatmap_for_variants.R @ 0:1062d6ad6503 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author iuc
date Wed, 02 Dec 2020 21:23:06 +0000
parents
children e362b3143cde
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/heatmap_for_variants.R	Wed Dec 02 21:23:06 2020 +0000
@@ -0,0 +1,212 @@
+#!/usr/bin/env R
+
+suppressPackageStartupMessages(library(pheatmap))
+suppressPackageStartupMessages(library(RColorBrewer))
+suppressPackageStartupMessages(library(tidyverse))
+
+fapply <- function(vect_ids, func) {
+    #' List apply but preserve the names
+    res <- lapply(vect_ids, func)
+    names(res) <- vect_ids
+    return(res)
+}
+
+                                        # M A I N
+stopifnot(exists("samples"))
+variant_files <- fapply(samples$ids, read_and_process)  # nolint
+
+extractall_data <- function(id) {
+    variants <- variant_files[[id]]
+    tmp <- variants %>%
+        mutate(posalt = uni_select) %>%
+        select(posalt, AF)
+    colnames(tmp) <- c("Mutation", id)
+    return(tmp)
+}
+
+extractall_annots <- function(id) {
+    variants <- variant_files[[id]]
+    tmp <- variants %>%
+        mutate(posalt = uni_select,
+               effect = EFF....EFFECT, gene = EFF....GENE) %>%
+        select(posalt, effect, gene)
+    return(tmp)
+}
+
+                                        # process allele frequencies
+processed_files <- fapply(samples$ids, extractall_data)
+final <- as_tibble(
+    processed_files %>%
+    reduce(full_join, by = "Mutation", copy = T))
+
+final <- final[str_order(final$Mutation, numeric = T), ] %>%
+    column_to_rownames("Mutation")              ## sort and set rownames
+final[final < variant_frequency] <- NA          ## adjust the variant frequency:
+final <- final[rowSums(is.na(final)) != ncol(final), ]
+final <- t(final)
+final[is.na(final)] <- 0
+class(final) <- "numeric"
+
+                                        # add annotations
+## readout annotations
+processed_annots <- fapply(samples$ids, extractall_annots)
+ann_final <- processed_annots %>%
+    reduce(function(x, y) {
+        unique(rbind(x, y))}) %>%
+    filter(posalt %in% colnames(final))         ## apply frequency filter
+ann_final <- as_tibble(ann_final[str_order(
+    ann_final$posalt, numeric = T), ]) %>%
+    column_to_rownames("posalt")                       ## sort
+
+                                        # rename annotations
+trans <- function(x, mapping, replace_missing=NULL) {
+    # helper function for translating effects
+    mapped <- mapping[[x]]
+    if (is.null(mapped)) {
+        if (is.null(replace_missing)) x else replace_missing
+    } else {
+        mapped
+    }
+}
+
+# handle translation of classic SnpEff effects to sequence ontology terms
+# The following list defines the complete mapping between classic and So effect
+# terms even if not all of these are likely to appear in viral variant data.
+classic_snpeff_effects_to_so <- list(
+    "coding_sequence_variant", "coding_sequence_variant", "disruptive_inframe_deletion", "disruptive_inframe_insertion", "inframe_deletion", "inframe_insertion", "downstream_gene_variant", "exon_variant", "exon_loss_variant", "frameshift_variant", "gene_variant", "intergenic_variant", "intergenic_region", "conserved_intergenic_variant", "intragenic_variant", "intron_variant", "conserved_intron_variant", "missense_variant", "rare_amino_acid_variant", "splice_acceptor_variant", "splice_donor_variant", "splice_region_variant", "5_prime_UTR_premature_start_codon_variant", "start_lost", "stop_gained", "stop_lost", "synonymous_variant", "start_retained_variant", "stop_retained_variant", "transcript_variant", "upstream_gene_variant", "3_prime_UTR_truncation_+_exon_loss_variant", "3_prime_UTR_variant", "5_prime_UTR_truncation_+_exon_loss_variant", "5_prime_UTR_variant", "initiator_codon_variant", "None", "chromosomal_deletion"
+)
+names(classic_snpeff_effects_to_so) <- c(
+    "CDS", "CODON_CHANGE", "CODON_CHANGE_PLUS_CODON_DELETION", "CODON_CHANGE_PLUS_CODON_INSERTION", "CODON_DELETION", "CODON_INSERTION", "DOWNSTREAM", "EXON", "EXON_DELETED", "FRAME_SHIFT", "GENE", "INTERGENIC", "INTERGENIC_REGION", "INTERGENIC_CONSERVED", "INTRAGENIC", "INTRON", "INTRON_CONSERVED", "NON_SYNONYMOUS_CODING", "RARE_AMINO_ACID", "SPLICE_SITE_ACCEPTOR", "SPLICE_SITE_DONOR", "SPLICE_SITE_REGION", "START_GAINED", "START_LOST", "STOP_GAINED", "STOP_LOST", "SYNONYMOUS_CODING", "SYNONYMOUS_START", "SYNONYMOUS_STOP", "TRANSCRIPT", "UPSTREAM", "UTR_3_DELETED", "UTR_3_PRIME", "UTR_5_DELETED", "UTR_5_PRIME", "NON_SYNONYMOUS_START", "NONE", "CHROMOSOME_LARGE_DELETION"
+)
+# translate classic effects into SO terms leaving unknown terms (possibly SO already) as is
+so_effects <- sapply(ann_final$effect, function(x) trans(x, classic_snpeff_effects_to_so))
+
+# handle further translation of effects we care about
+so_effects_translation <- list(
+    "non-syn", "syn",
+    "deletion", "deletion", "deletion",
+    "insertion", "insertion", "frame shift",
+    "stop gained", "stop lost"
+)
+names(so_effects_translation) <- c(
+    "missense_variant", "synonymous_variant",
+    "disruptive_inframe_deletion", "inframe_deletion", "chromosomal_deletion",
+    "disruptive_inframe_insertion", "inframe_insertion", "frameshift_variant",
+    "stop_gained", "stop_lost"
+)
+# translate to our simple terms turning undefined terms into '?'
+simple_effects <- sapply(so_effects, function(x) trans(x, so_effects_translation, replace_missing = "?"))
+# complex variant effects (those that do more than one thing) are concatenated
+# with either '+' (for classic terms) or '&' (for SO terms)
+simple_effects[grepl("+", so_effects, fixed = TRUE)] <- "complex"
+simple_effects[grepl("&", so_effects, fixed = TRUE)] <- "complex"
+simple_effects[so_effects == ""] <- "non-coding"
+
+ann_final$effect <- simple_effects
+ann_final$gene <- sub("^$", "NCR", ann_final$gene)
+
+## automatically determine gaps for the heatmap
+gap_vector <- which(!(ann_final$gene[1:length(ann_final$gene) - 1] ==  # nolint
+                      ann_final$gene[2:length(ann_final$gene)]))
+
+                                        # colormanagement
+my_colors <- colorRampPalette(c("grey93", "brown", "black")) #heatmap
+count <- length(unique(ann_final$gene))                     #annotations (genes)
+gene_color <- c(brewer.pal(brewer_color_gene_annotation, n = count))
+names(gene_color) <- unique(ann_final$gene)
+
+                                        # colormanagement annotations (effect)
+## Define the full set of colors for each effect that we can encounter
+## This is not bulletproof. The effect names given here were swapped into the
+## data (see above substitutions in ann_final$effect) and so are hard-coded,
+## as well as their preferred colors.
+
+all_colors <- data.frame(
+    color = c("white", "green", "orange", "red",
+              "black", "grey", "yellow", "blue", "purple", "brown"),
+    name = c("non-coding", "syn", "non-syn", "deletion",
+             "frame shift", "stop gained", "stop lost", "insertion",
+             "complex", "?"))
+## Reduce the full set to just those that we want
+detected_effects <- unique(ann_final$effect)
+subset_colors <- subset(all_colors, name %in% detected_effects)
+effect_color <- subset_colors$color
+names(effect_color) <- subset_colors$name
+color_list <- list(gene_color = gene_color, effect_color = effect_color)
+names(color_list) <- c("gene", "effect")
+
+                                        # visualize heatmap
+if (pheat_number_of_clusters > length(samples$ids)) {
+    print(paste0("[INFO] Number of clusters: User-specified clusters (",
+                 pheat_number_of_clusters,
+                 ") is greater than the number of samples (",
+                 length(samples$ids), ")"))
+    pheat_number_of_clusters <- length(samples$ids)
+    print(paste0("[INFO] Number of clusters: now set to ",
+                 pheat_number_of_clusters))
+}
+
+get_plot_dims <- function(heat_map) {
+    ## get the dimensions of a pheatmap object
+    ## useful for plot formats that can't be written to a file directly, but
+    ## for which we need to set up a plotting device
+    ## source: https://stackoverflow.com/a/61876386
+    plot_height <- sum(sapply(heat_map$gtable$heights,
+                              grid::convertHeight, "in"))
+    plot_width  <- sum(sapply(heat_map$gtable$widths,
+                              grid::convertWidth, "in"))
+  return(list(height = plot_height, width = plot_width))
+}
+
+height <- round(max(c(max(c(
+    16 * (length(unique(ann_final$effect)) +
+        length(unique(ann_final$gene))), 160)) /
+    nrow(final), 15)))
+width <- round(ratio * height)
+
+
+if (!(out_ext %in% c("svg", "jpeg", "png", "pdf"))) {
+    stop("Unknown extension: ", ext, ", aborting.")
+}
+plot_device <- get(out_ext)
+
+
+## A constant scaling factor based on the calculated dimensions
+## above does not work for PNG, so we resort to feeding pheatmap
+## with a direct filename
+plot_filename <- NA
+if (out_ext %in% c("jpeg", "png")) {
+    plot_filename <- out_file
+}
+
+## SVG is not a format pheatmap knows how to write to a file directly.
+## As a workaround we
+## 1. create the plot object
+## 2. get its dimensions
+## 3. set up a svg plotting device with these dimensions
+## 4. print the heatmap object to the device
+hm <- pheatmap(
+    final,
+    color = my_colors(100),
+    cellwidth = width,
+    cellheight = height,
+    fontsize_col = round(1 / 3 * width),
+    fontsize_row = round(1 / 3 * min(c(height, width))),
+    clustering_method = pheat_clustering_method,
+    cluster_rows = pheat_clustering,
+    cluster_cols = F,
+    cutree_rows = pheat_number_of_clusters,
+    annotation_col = ann_final,
+    annotation_colors = color_list,
+    filename = plot_filename,
+    gaps_col = gap_vector
+)
+
+if (out_ext %in% c("pdf", "svg")) {
+    plot_dims <- get_plot_dims(hm)
+    plot_device(out_file,
+                width = plot_dims$width,
+                height = plot_dims$height)
+    print(hm)
+    dev.off()
+}