Mercurial > repos > iuc > snpfreqplot
comparison helperFunctions.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
| author | iuc |
|---|---|
| date | Wed, 02 Dec 2020 21:23:06 +0000 |
| parents | |
| children | e362b3143cde |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1062d6ad6503 |
|---|---|
| 1 #!/usr/bin/env R | |
| 2 | |
| 3 ## Helper functions for processing variant data, especially for | |
| 4 ## data which contains duplicate variants differing only on | |
| 5 ## annotation. | |
| 6 | |
| 7 difference_in_group <- function(lines) { | |
| 8 #' Find the columns containing the differences between a | |
| 9 #' group of lines sharing the same POS and ALT. | |
| 10 #' e.g. | |
| 11 #' CHROM POS REF ALT IMPACT FUNCLASS AA GENE | |
| 12 #' 1 NC_045512.2 3037 C T LOW SILENT F924 ORF1ab | |
| 13 #' 2 NC_045512.2 3037 C T LOW SILENT F106 ORF1ab | |
| 14 #' 3 NC_045512.2 3037 C T MEDIUM SILENT F106 ORF1ab | |
| 15 #' | |
| 16 #' should yield: | |
| 17 #' | |
| 18 #' unique.name | |
| 19 #' 1 3037 T (LOW|F924) | |
| 20 #' 2 3037 T (LOW|F106) | |
| 21 #' 3 3037 T (MEDIUM|F106) | |
| 22 #' | |
| 23 #' i.e. it identifies that IMPACT and AA are the differing | |
| 24 #' columns for each of the rows | |
| 25 #' | |
| 26 #' Ideally this function should just be used as | |
| 27 #' ``tab %>% group_by(POS, ALT) %>% difference_in_group()`` | |
| 28 #' | |
| 29 diff.colnames <- c() | |
| 30 nlines <- nrow(lines) | |
| 31 if (nlines > 1) { | |
| 32 for (i in 1:(nlines - 1)) { | |
| 33 test1 <- lines[i, ] | |
| 34 for (j in i:nlines) { | |
| 35 test2 <- lines[j, ] | |
| 36 diff.colnames <- c(diff.colnames, | |
| 37 names(test1[!(test1 %in% test2)])) | |
| 38 } | |
| 39 } | |
| 40 } | |
| 41 uni_select <- c("POS", "ALT", diff.colnames) | |
| 42 return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint | |
| 43 } | |
| 44 | |
| 45 split_table_and_process <- function(tab) { | |
| 46 #' Split TAB into groups sharing the same POS and ALT | |
| 47 #' and create distinguishing labels. | |
| 48 #' | |
| 49 #' Calls the above ``difference_in_group`` for each | |
| 50 #' discovered group. | |
| 51 #' | |
| 52 #' This function is necessary because tidyr is difficult | |
| 53 #' to write custom group binding functions. | |
| 54 posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint | |
| 55 nlines <- nrow(tab) | |
| 56 groups <- list() | |
| 57 groups[[1]] <- c(1, 1) | |
| 58 last_pa <- paste(posalts[1, ]) | |
| 59 for (r in 2:nlines) { | |
| 60 curr_pa <- paste(posalts[r, ]) | |
| 61 posalt_diff_between_lines <- !all(last_pa == curr_pa) | |
| 62 if (posalt_diff_between_lines) { | |
| 63 ## end of current group, start of new | |
| 64 groups[[length(groups)]][2] <- r - 1 ## change prev end | |
| 65 groups[[length(groups) + 1]] <- c(r, r) ## set (start, end) | |
| 66 } else if (r == nlines) { | |
| 67 ## i.e. if the very last line shares | |
| 68 ## the same POS ALT as the one before, | |
| 69 ## close current group. | |
| 70 groups[[length(groups)]][2] <- r | |
| 71 } | |
| 72 last_pa <- curr_pa | |
| 73 } | |
| 74 as_tibble(do.call( | |
| 75 "rbind", | |
| 76 lapply(groups, function(grange) { | |
| 77 expand_range <- grange[1]:grange[2] | |
| 78 difference_in_group(tab[expand_range, ]) | |
| 79 }) | |
| 80 )) | |
| 81 } | |
| 82 | |
| 83 read_and_process <- function(id) { | |
| 84 file <- (samples %>% filter(ids == id))$files # nolint | |
| 85 variants <- read.table(file, header = T, sep = "\t") | |
| 86 uniq_ids <- split_table_and_process(variants) | |
| 87 if (nrow(variants) != nrow(uniq_ids)) { | |
| 88 stop(paste0(id, " '", file, "' failed: ", file, "\"", | |
| 89 "nrow(variants)=", nrow(variants), | |
| 90 " but nrow(uniq_ids)=", nrow(uniq_ids))) | |
| 91 } | |
| 92 variants <- as_tibble(cbind(variants, uniq_ids)) # nolint | |
| 93 return(variants) | |
| 94 } |
