comparison helperFunctions.R @ 0:1062d6ad6503 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author iuc
date Wed, 02 Dec 2020 21:23:06 +0000
parents
children e362b3143cde
comparison
equal deleted inserted replaced
-1:000000000000 0:1062d6ad6503
1 #!/usr/bin/env R
2
3 ## Helper functions for processing variant data, especially for
4 ## data which contains duplicate variants differing only on
5 ## annotation.
6
7 difference_in_group <- function(lines) {
8 #' Find the columns containing the differences between a
9 #' group of lines sharing the same POS and ALT.
10 #' e.g.
11 #' CHROM POS REF ALT IMPACT FUNCLASS AA GENE
12 #' 1 NC_045512.2 3037 C T LOW SILENT F924 ORF1ab
13 #' 2 NC_045512.2 3037 C T LOW SILENT F106 ORF1ab
14 #' 3 NC_045512.2 3037 C T MEDIUM SILENT F106 ORF1ab
15 #'
16 #' should yield:
17 #'
18 #' unique.name
19 #' 1 3037 T (LOW|F924)
20 #' 2 3037 T (LOW|F106)
21 #' 3 3037 T (MEDIUM|F106)
22 #'
23 #' i.e. it identifies that IMPACT and AA are the differing
24 #' columns for each of the rows
25 #'
26 #' Ideally this function should just be used as
27 #' ``tab %>% group_by(POS, ALT) %>% difference_in_group()``
28 #'
29 diff.colnames <- c()
30 nlines <- nrow(lines)
31 if (nlines > 1) {
32 for (i in 1:(nlines - 1)) {
33 test1 <- lines[i, ]
34 for (j in i:nlines) {
35 test2 <- lines[j, ]
36 diff.colnames <- c(diff.colnames,
37 names(test1[!(test1 %in% test2)]))
38 }
39 }
40 }
41 uni_select <- c("POS", "ALT", diff.colnames)
42 return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint
43 }
44
45 split_table_and_process <- function(tab) {
46 #' Split TAB into groups sharing the same POS and ALT
47 #' and create distinguishing labels.
48 #'
49 #' Calls the above ``difference_in_group`` for each
50 #' discovered group.
51 #'
52 #' This function is necessary because tidyr is difficult
53 #' to write custom group binding functions.
54 posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint
55 nlines <- nrow(tab)
56 groups <- list()
57 groups[[1]] <- c(1, 1)
58 last_pa <- paste(posalts[1, ])
59 for (r in 2:nlines) {
60 curr_pa <- paste(posalts[r, ])
61 posalt_diff_between_lines <- !all(last_pa == curr_pa)
62 if (posalt_diff_between_lines) {
63 ## end of current group, start of new
64 groups[[length(groups)]][2] <- r - 1 ## change prev end
65 groups[[length(groups) + 1]] <- c(r, r) ## set (start, end)
66 } else if (r == nlines) {
67 ## i.e. if the very last line shares
68 ## the same POS ALT as the one before,
69 ## close current group.
70 groups[[length(groups)]][2] <- r
71 }
72 last_pa <- curr_pa
73 }
74 as_tibble(do.call(
75 "rbind",
76 lapply(groups, function(grange) {
77 expand_range <- grange[1]:grange[2]
78 difference_in_group(tab[expand_range, ])
79 })
80 ))
81 }
82
83 read_and_process <- function(id) {
84 file <- (samples %>% filter(ids == id))$files # nolint
85 variants <- read.table(file, header = T, sep = "\t")
86 uniq_ids <- split_table_and_process(variants)
87 if (nrow(variants) != nrow(uniq_ids)) {
88 stop(paste0(id, " '", file, "' failed: ", file, "\"",
89 "nrow(variants)=", nrow(variants),
90 " but nrow(uniq_ids)=", nrow(uniq_ids)))
91 }
92 variants <- as_tibble(cbind(variants, uniq_ids)) # nolint
93 return(variants)
94 }