Mercurial > repos > iuc > snpfreqplot
comparison snpEffExtract.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author | iuc |
---|---|
date | Wed, 02 Dec 2020 21:23:06 +0000 |
parents | |
children | dc51db22310c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1062d6ad6503 |
---|---|
1 #!/usr/bin/env R | |
2 | |
3 suppressPackageStartupMessages(library(VariantAnnotation)) | |
4 suppressPackageStartupMessages(library(tidyverse)) | |
5 | |
6 tsv_eff_from_vcf <- function(input_vcf, output_tab) { | |
7 read_vcf <- readVcf(input_vcf) # nolint | |
8 chrom_pos <- data.frame(read_vcf@rowRanges)[, c("seqnames", "start")] | |
9 ref_alt_filter <- read_vcf@fixed[, c("REF", "ALT", "FILTER")] | |
10 dp_af <- read_vcf@info[c("DP", "AF")] | |
11 | |
12 ## Unwrap the DNAStringList | |
13 # nolint start | |
14 ref_alt_filter <- data.frame( | |
15 REF = as.character(ref_alt_filter$REF), | |
16 ALT = sapply(seq_len(nrow(ref_alt_filter)), function(i) { | |
17 as.character(ref_alt_filter$ALT[[i]]) | |
18 }), | |
19 FILTER = as.character(ref_alt_filter$FILTER)) | |
20 # nolint end | |
21 ## | |
22 ## Don't unwrap EFF yet, we need to preserve rows | |
23 eff <- read_vcf@info["EFF"] | |
24 | |
25 stopifnot(nrow(chrom_pos) == nrow(ref_alt_filter)) | |
26 stopifnot(nrow(ref_alt_filter) == nrow(dp_af)) | |
27 stopifnot(nrow(dp_af) == nrow(eff)) | |
28 | |
29 ## EFF data contains nested constructs we need to unify all | |
30 ## data sources first, and then explode the EFF column. | |
31 united <- as_tibble(cbind(chrom_pos, ref_alt_filter, dp_af, eff)) # nolint | |
32 united_exploderows <- unnest(united, cols = c(EFF)) # nolint | |
33 | |
34 united_exploderows <- united_exploderows %>% | |
35 dplyr::mutate(CHROM = seqnames, POS = start) %>% | |
36 dplyr::select(CHROM, POS, REF, ALT, FILTER, DP, AF, EFF) | |
37 | |
38 ## EFF columns are defined here: | |
39 ## https://pcingola.github.io/SnpEff/se_inputoutput/ | |
40 options(warn = -1) ## suppress warnings | |
41 seperated_info <- united_exploderows %>% | |
42 separate(EFF, sep = "[(|)]", | |
43 extra = "merge", ## extra values merged into "extra" column | |
44 into = c("EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", | |
45 "codon.change", "EFF[*].AA", "AA.length", | |
46 "EFF[*].GENE", "trans.biotype", "gene.coding", | |
47 "trans.id", "exon.rank", "gt.num", "warnings", | |
48 "extra")) | |
49 options(warn = 0) | |
50 ## If there is data that has been dropped or filled-in, we will see it in | |
51 ## the "extra" column if it isn't NA or an empty quote. | |
52 test_missing <- seperated_info %>% | |
53 dplyr::select("CHROM", "POS", "extra") %>% | |
54 replace_na(list(extra = "")) %>% | |
55 filter(extra != "") | |
56 | |
57 if (nrow(test_missing) > 0) { | |
58 print(test_missing) | |
59 stop("Extra values were not parsed") | |
60 } | |
61 | |
62 vcf_info <- seperated_info %>% | |
63 dplyr::select("CHROM", "POS", "REF", "ALT", "FILTER", "DP", "AF", | |
64 "EFF[*].EFFECT", "EFF[*].IMPACT", "EFF[*].FUNCLASS", | |
65 "EFF[*].AA", "EFF[*].GENE") %>% | |
66 ## now we de-duplicate any rows that arise from subselecting columns | |
67 dplyr::distinct() | |
68 | |
69 ## At this point, we would still have rows which share a POS and ALT pair | |
70 ## which could be problematic for the heatmap plot later. | |
71 ## | |
72 ## This is not something to worry about here, and is resolved in the heatmap | |
73 ## script later. | |
74 write.table(vcf_info, file = output_tab, | |
75 quote = F, sep = "\t", row.names = F) | |
76 } | |
77 | |
78 | |
79 # M A I N | |
80 stopifnot(exists("samples")) | |
81 | |
82 for (i in seq_len(nrow(samples))) { | |
83 entry <- samples[i, ]; | |
84 if (entry$exts %in% c("vcf", "vcf.gz")) { | |
85 in_vcf <- entry$files | |
86 out_tsv <- paste0(entry$ids, ".tsv") ## use local dir | |
87 tsv_eff_from_vcf(in_vcf, out_tsv) | |
88 ## point to the new file | |
89 samples[i, ]$files <- out_tsv | |
90 message(paste(entry$ids, ": converted from VCF to tabular")) | |
91 } else { | |
92 message(paste(entry$ids, ": already tabular")) | |
93 } | |
94 } |