comparison snpEffExtract.R @ 0:1062d6ad6503 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author iuc
date Wed, 02 Dec 2020 21:23:06 +0000
parents
children dc51db22310c
comparison
equal deleted inserted replaced
-1:000000000000 0:1062d6ad6503
1 #!/usr/bin/env R
2
3 suppressPackageStartupMessages(library(VariantAnnotation))
4 suppressPackageStartupMessages(library(tidyverse))
5
6 tsv_eff_from_vcf <- function(input_vcf, output_tab) {
7 read_vcf <- readVcf(input_vcf) # nolint
8 chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")]
9 ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")]
10 dp_af <- read_vcf@info[c("DP", "AF")]
11
12 ## Unwrap the DNAStringList
13 # nolint start
14 ref_alt_filter <- data.frame(
15 REF = as.character(ref_alt_filter$REF),
16 ALT = sapply(seq_len(nrow(ref_alt_filter)), function(i) {
17 as.character(ref_alt_filter$ALT[[i]])
18 }),
19 FILTER = as.character(ref_alt_filter$FILTER))
20 # nolint end
21 ##
22 ## Don't unwrap EFF yet, we need to preserve rows
23 eff <- read_vcf@info["EFF"]
24
25 stopifnot(nrow(chrom_pos) == nrow(ref_alt_filter))
26 stopifnot(nrow(ref_alt_filter) == nrow(dp_af))
27 stopifnot(nrow(dp_af) == nrow(eff))
28
29 ## EFF data contains nested constructs we need to unify all
30 ## data sources first, and then explode the EFF column.
31 united <- as_tibble(cbind(chrom_pos, ref_alt_filter, dp_af, eff)) # nolint
32 united_exploderows <- unnest(united, cols = c(EFF)) # nolint
33
34 united_exploderows <- united_exploderows %>%
35 dplyr::mutate(CHROM = seqnames, POS = start) %>%
36 dplyr::select(CHROM, POS, REF, ALT, FILTER, DP, AF, EFF)
37
38 ## EFF columns are defined here:
39 ## https://pcingola.github.io/SnpEff/se_inputoutput/
40 options(warn = -1) ## suppress warnings
41 seperated_info <- united_exploderows %>%
42 separate(EFF, sep = "[(|)]",
43 extra = "merge", ## extra values merged into "extra" column
44 into = c("EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS",
45 "codon.change", "EFF[*].AA", "AA.length",
46 "EFF[*].GENE", "trans.biotype", "gene.coding",
47 "trans.id", "exon.rank", "gt.num", "warnings",
48 "extra"))
49 options(warn = 0)
50 ## If there is data that has been dropped or filled-in, we will see it in
51 ## the "extra" column if it isn't NA or an empty quote.
52 test_missing <- seperated_info %>%
53 dplyr::select("CHROM", "POS", "extra") %>%
54 replace_na(list(extra = "")) %>%
55 filter(extra != "")
56
57 if (nrow(test_missing) > 0) {
58 print(test_missing)
59 stop("Extra values were not parsed")
60 }
61
62 vcf_info <- seperated_info %>%
63 dplyr::select("CHROM", "POS", "REF", "ALT", "FILTER", "DP", "AF",
64 "EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS",
65 "EFF[*].AA", "EFF[*].GENE") %>%
66 ## now we de-duplicate any rows that arise from subselecting columns
67 dplyr::distinct()
68
69 ## At this point, we would still have rows which share a POS and ALT pair
70 ## which could be problematic for the heatmap plot later.
71 ##
72 ## This is not something to worry about here, and is resolved in the heatmap
73 ## script later.
74 write.table(vcf_info, file = output_tab,
75 quote = F, sep = "\t", row.names = F)
76 }
77
78
79 # M A I N
80 stopifnot(exists("samples"))
81
82 for (i in seq_len(nrow(samples))) {
83 entry <- samples[i, ];
84 if (entry$exts %in% c("vcf", "vcf.gz")) {
85 in_vcf <- entry$files
86 out_tsv <- paste0(entry$ids, ".tsv") ## use local dir
87 tsv_eff_from_vcf(in_vcf, out_tsv)
88 ## point to the new file
89 samples[i, ]$files <- out_tsv
90 message(paste(entry$ids, ": converted from VCF to tabular"))
91 } else {
92 message(paste(entry$ids, ": already tabular"))
93 }
94 }