Mercurial > repos > iuc > snpfreqplot
diff snpEffExtract.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author | iuc |
---|---|
date | Wed, 02 Dec 2020 21:23:06 +0000 |
parents | |
children | dc51db22310c |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snpEffExtract.R Wed Dec 02 21:23:06 2020 +0000 @@ -0,0 +1,94 @@ +#!/usr/bin/env R + +suppressPackageStartupMessages(library(VariantAnnotation)) +suppressPackageStartupMessages(library(tidyverse)) + +tsv_eff_from_vcf <- function(input_vcf, output_tab) { + read_vcf <- readVcf(input_vcf) # nolint + chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")] + ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")] + dp_af <- read_vcf@info[c("DP", "AF")] + + ## Unwrap the DNAStringList + # nolint start + ref_alt_filter <- data.frame( + REF = as.character(ref_alt_filter$REF), + ALT = sapply(seq_len(nrow(ref_alt_filter)), function(i) { + as.character(ref_alt_filter$ALT[[i]]) + }), + FILTER = as.character(ref_alt_filter$FILTER)) + # nolint end + ## + ## Don't unwrap EFF yet, we need to preserve rows + eff <- read_vcf@info["EFF"] + + stopifnot(nrow(chrom_pos) == nrow(ref_alt_filter)) + stopifnot(nrow(ref_alt_filter) == nrow(dp_af)) + stopifnot(nrow(dp_af) == nrow(eff)) + + ## EFF data contains nested constructs we need to unify all + ## data sources first, and then explode the EFF column. + united <- as_tibble(cbind(chrom_pos, ref_alt_filter, dp_af, eff)) # nolint + united_exploderows <- unnest(united, cols = c(EFF)) # nolint + + united_exploderows <- united_exploderows %>% + dplyr::mutate(CHROM = seqnames, POS = start) %>% + dplyr::select(CHROM, POS, REF, ALT, FILTER, DP, AF, EFF) + + ## EFF columns are defined here: + ## https://pcingola.github.io/SnpEff/se_inputoutput/ + options(warn = -1) ## suppress warnings + seperated_info <- united_exploderows %>% + separate(EFF, sep = "[(|)]", + extra = "merge", ## extra values merged into "extra" column + into = c("EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", + "codon.change", "EFF[*].AA", "AA.length", + "EFF[*].GENE", "trans.biotype", "gene.coding", + "trans.id", "exon.rank", "gt.num", "warnings", + "extra")) + options(warn = 0) + ## If there is data that has been dropped or filled-in, we will see it in + ## the "extra" column if it isn't NA or an empty quote. + test_missing <- seperated_info %>% + dplyr::select("CHROM", "POS", "extra") %>% + replace_na(list(extra = "")) %>% + filter(extra != "") + + if (nrow(test_missing) > 0) { + print(test_missing) + stop("Extra values were not parsed") + } + + vcf_info <- seperated_info %>% + dplyr::select("CHROM", "POS", "REF", "ALT", "FILTER", "DP", "AF", + "EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", + "EFF[*].AA", "EFF[*].GENE") %>% + ## now we de-duplicate any rows that arise from subselecting columns + dplyr::distinct() + + ## At this point, we would still have rows which share a POS and ALT pair + ## which could be problematic for the heatmap plot later. + ## + ## This is not something to worry about here, and is resolved in the heatmap + ## script later. + write.table(vcf_info, file = output_tab, + quote = F, sep = "\t", row.names = F) +} + + + # M A I N +stopifnot(exists("samples")) + +for (i in seq_len(nrow(samples))) { + entry <- samples[i, ]; + if (entry$exts %in% c("vcf", "vcf.gz")) { + in_vcf <- entry$files + out_tsv <- paste0(entry$ids, ".tsv") ## use local dir + tsv_eff_from_vcf(in_vcf, out_tsv) + ## point to the new file + samples[i, ]$files <- out_tsv + message(paste(entry$ids, ": converted from VCF to tabular")) + } else { + message(paste(entry$ids, ": already tabular")) + } +}