Mercurial > repos > iuc > snpfreqplot
view snpEffExtract.R @ 1:e362b3143cde draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1bde09fccd1a5412240ebd5c1f34a45ad73cebe2"
author | iuc |
---|---|
date | Thu, 10 Dec 2020 13:41:29 +0000 |
parents | 1062d6ad6503 |
children | dc51db22310c |
line wrap: on
line source
#!/usr/bin/env R suppressPackageStartupMessages(library(VariantAnnotation)) suppressPackageStartupMessages(library(tidyverse)) tsv_eff_from_vcf <- function(input_vcf, output_tab) { read_vcf <- readVcf(input_vcf) # nolint chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")] ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")] dp_af <- read_vcf@info[c("DP", "AF")] ## Unwrap the DNAStringList # nolint start ref_alt_filter <- data.frame( REF = as.character(ref_alt_filter$REF), ALT = sapply(seq_len(nrow(ref_alt_filter)), function(i) { as.character(ref_alt_filter$ALT[[i]]) }), FILTER = as.character(ref_alt_filter$FILTER)) # nolint end ## ## Don't unwrap EFF yet, we need to preserve rows eff <- read_vcf@info["EFF"] stopifnot(nrow(chrom_pos) == nrow(ref_alt_filter)) stopifnot(nrow(ref_alt_filter) == nrow(dp_af)) stopifnot(nrow(dp_af) == nrow(eff)) ## EFF data contains nested constructs we need to unify all ## data sources first, and then explode the EFF column. united <- as_tibble(cbind(chrom_pos, ref_alt_filter, dp_af, eff)) # nolint united_exploderows <- unnest(united, cols = c(EFF)) # nolint united_exploderows <- united_exploderows %>% dplyr::mutate(CHROM = seqnames, POS = start) %>% dplyr::select(CHROM, POS, REF, ALT, FILTER, DP, AF, EFF) ## EFF columns are defined here: ## https://pcingola.github.io/SnpEff/se_inputoutput/ options(warn = -1) ## suppress warnings seperated_info <- united_exploderows %>% separate(EFF, sep = "[(|)]", extra = "merge", ## extra values merged into "extra" column into = c("EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", "codon.change", "EFF[*].AA", "AA.length", "EFF[*].GENE", "trans.biotype", "gene.coding", "trans.id", "exon.rank", "gt.num", "warnings", "extra")) options(warn = 0) ## If there is data that has been dropped or filled-in, we will see it in ## the "extra" column if it isn't NA or an empty quote. test_missing <- seperated_info %>% dplyr::select("CHROM", "POS", "extra") %>% replace_na(list(extra = "")) %>% filter(extra != "") if (nrow(test_missing) > 0) { print(test_missing) stop("Extra values were not parsed") } vcf_info <- seperated_info %>% dplyr::select("CHROM", "POS", "REF", "ALT", "FILTER", "DP", "AF", "EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", "EFF[*].AA", "EFF[*].GENE") %>% ## now we de-duplicate any rows that arise from subselecting columns dplyr::distinct() ## At this point, we would still have rows which share a POS and ALT pair ## which could be problematic for the heatmap plot later. ## ## This is not something to worry about here, and is resolved in the heatmap ## script later. write.table(vcf_info, file = output_tab, quote = F, sep = "\t", row.names = F) } # M A I N stopifnot(exists("samples")) for (i in seq_len(nrow(samples))) { entry <- samples[i, ]; if (entry$exts %in% c("vcf", "vcf.gz")) { in_vcf <- entry$files out_tsv <- paste0(entry$ids, ".tsv") ## use local dir tsv_eff_from_vcf(in_vcf, out_tsv) ## point to the new file samples[i, ]$files <- out_tsv message(paste(entry$ids, ": converted from VCF to tabular")) } else { message(paste(entry$ids, ": already tabular")) } }