Mercurial > repos > iuc > snpfreqplot
comparison helperFunctions.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author | iuc |
---|---|
date | Wed, 02 Dec 2020 21:23:06 +0000 |
parents | |
children | e362b3143cde |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1062d6ad6503 |
---|---|
1 #!/usr/bin/env R | |
2 | |
3 ## Helper functions for processing variant data, especially for | |
4 ## data which contains duplicate variants differing only on | |
5 ## annotation. | |
6 | |
7 difference_in_group <- function(lines) { | |
8 #' Find the columns containing the differences between a | |
9 #' group of lines sharing the same POS and ALT. | |
10 #' e.g. | |
11 #' CHROM POS REF ALT IMPACT FUNCLASS AA GENE | |
12 #' 1 NC_045512.2 3037 C T LOW SILENT F924 ORF1ab | |
13 #' 2 NC_045512.2 3037 C T LOW SILENT F106 ORF1ab | |
14 #' 3 NC_045512.2 3037 C T MEDIUM SILENT F106 ORF1ab | |
15 #' | |
16 #' should yield: | |
17 #' | |
18 #' unique.name | |
19 #' 1 3037 T (LOW|F924) | |
20 #' 2 3037 T (LOW|F106) | |
21 #' 3 3037 T (MEDIUM|F106) | |
22 #' | |
23 #' i.e. it identifies that IMPACT and AA are the differing | |
24 #' columns for each of the rows | |
25 #' | |
26 #' Ideally this function should just be used as | |
27 #' ``tab %>% group_by(POS, ALT) %>% difference_in_group()`` | |
28 #' | |
29 diff.colnames <- c() | |
30 nlines <- nrow(lines) | |
31 if (nlines > 1) { | |
32 for (i in 1:(nlines - 1)) { | |
33 test1 <- lines[i, ] | |
34 for (j in i:nlines) { | |
35 test2 <- lines[j, ] | |
36 diff.colnames <- c(diff.colnames, | |
37 names(test1[!(test1 %in% test2)])) | |
38 } | |
39 } | |
40 } | |
41 uni_select <- c("POS", "ALT", diff.colnames) | |
42 return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint | |
43 } | |
44 | |
45 split_table_and_process <- function(tab) { | |
46 #' Split TAB into groups sharing the same POS and ALT | |
47 #' and create distinguishing labels. | |
48 #' | |
49 #' Calls the above ``difference_in_group`` for each | |
50 #' discovered group. | |
51 #' | |
52 #' This function is necessary because tidyr is difficult | |
53 #' to write custom group binding functions. | |
54 posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint | |
55 nlines <- nrow(tab) | |
56 groups <- list() | |
57 groups[[1]] <- c(1, 1) | |
58 last_pa <- paste(posalts[1, ]) | |
59 for (r in 2:nlines) { | |
60 curr_pa <- paste(posalts[r, ]) | |
61 posalt_diff_between_lines <- !all(last_pa == curr_pa) | |
62 if (posalt_diff_between_lines) { | |
63 ## end of current group, start of new | |
64 groups[[length(groups)]][2] <- r - 1 ## change prev end | |
65 groups[[length(groups) + 1]] <- c(r, r) ## set (start, end) | |
66 } else if (r == nlines) { | |
67 ## i.e. if the very last line shares | |
68 ## the same POS ALT as the one before, | |
69 ## close current group. | |
70 groups[[length(groups)]][2] <- r | |
71 } | |
72 last_pa <- curr_pa | |
73 } | |
74 as_tibble(do.call( | |
75 "rbind", | |
76 lapply(groups, function(grange) { | |
77 expand_range <- grange[1]:grange[2] | |
78 difference_in_group(tab[expand_range, ]) | |
79 }) | |
80 )) | |
81 } | |
82 | |
83 read_and_process <- function(id) { | |
84 file <- (samples %>% filter(ids == id))$files # nolint | |
85 variants <- read.table(file, header = T, sep = "\t") | |
86 uniq_ids <- split_table_and_process(variants) | |
87 if (nrow(variants) != nrow(uniq_ids)) { | |
88 stop(paste0(id, " '", file, "' failed: ", file, "\"", | |
89 "nrow(variants)=", nrow(variants), | |
90 " but nrow(uniq_ids)=", nrow(uniq_ids))) | |
91 } | |
92 variants <- as_tibble(cbind(variants, uniq_ids)) # nolint | |
93 return(variants) | |
94 } |