Mercurial > repos > iuc > snpfreqplot
diff helperFunctions.R @ 0:1062d6ad6503 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author | iuc |
---|---|
date | Wed, 02 Dec 2020 21:23:06 +0000 |
parents | |
children | e362b3143cde |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/helperFunctions.R Wed Dec 02 21:23:06 2020 +0000 @@ -0,0 +1,94 @@ +#!/usr/bin/env R + +## Helper functions for processing variant data, especially for +## data which contains duplicate variants differing only on +## annotation. + +difference_in_group <- function(lines) { + #' Find the columns containing the differences between a + #' group of lines sharing the same POS and ALT. + #' e.g. + #' CHROM POS REF ALT IMPACT FUNCLASS AA GENE + #' 1 NC_045512.2 3037 C T LOW SILENT F924 ORF1ab + #' 2 NC_045512.2 3037 C T LOW SILENT F106 ORF1ab + #' 3 NC_045512.2 3037 C T MEDIUM SILENT F106 ORF1ab + #' + #' should yield: + #' + #' unique.name + #' 1 3037 T (LOW|F924) + #' 2 3037 T (LOW|F106) + #' 3 3037 T (MEDIUM|F106) + #' + #' i.e. it identifies that IMPACT and AA are the differing + #' columns for each of the rows + #' + #' Ideally this function should just be used as + #' ``tab %>% group_by(POS, ALT) %>% difference_in_group()`` + #' + diff.colnames <- c() + nlines <- nrow(lines) + if (nlines > 1) { + for (i in 1:(nlines - 1)) { + test1 <- lines[i, ] + for (j in i:nlines) { + test2 <- lines[j, ] + diff.colnames <- c(diff.colnames, + names(test1[!(test1 %in% test2)])) + } + } + } + uni_select <- c("POS", "ALT", diff.colnames) + return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint +} + +split_table_and_process <- function(tab) { + #' Split TAB into groups sharing the same POS and ALT + #' and create distinguishing labels. + #' + #' Calls the above ``difference_in_group`` for each + #' discovered group. + #' + #' This function is necessary because tidyr is difficult + #' to write custom group binding functions. + posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint + nlines <- nrow(tab) + groups <- list() + groups[[1]] <- c(1, 1) + last_pa <- paste(posalts[1, ]) + for (r in 2:nlines) { + curr_pa <- paste(posalts[r, ]) + posalt_diff_between_lines <- !all(last_pa == curr_pa) + if (posalt_diff_between_lines) { + ## end of current group, start of new + groups[[length(groups)]][2] <- r - 1 ## change prev end + groups[[length(groups) + 1]] <- c(r, r) ## set (start, end) + } else if (r == nlines) { + ## i.e. if the very last line shares + ## the same POS ALT as the one before, + ## close current group. + groups[[length(groups)]][2] <- r + } + last_pa <- curr_pa + } + as_tibble(do.call( + "rbind", + lapply(groups, function(grange) { + expand_range <- grange[1]:grange[2] + difference_in_group(tab[expand_range, ]) + }) + )) +} + +read_and_process <- function(id) { + file <- (samples %>% filter(ids == id))$files # nolint + variants <- read.table(file, header = T, sep = "\t") + uniq_ids <- split_table_and_process(variants) + if (nrow(variants) != nrow(uniq_ids)) { + stop(paste0(id, " '", file, "' failed: ", file, "\"", + "nrow(variants)=", nrow(variants), + " but nrow(uniq_ids)=", nrow(uniq_ids))) + } + variants <- as_tibble(cbind(variants, uniq_ids)) # nolint + return(variants) +}