snpfreqplot: helperFunctions.R comparison

comparison helperFunctions.R @ 0:1062d6ad6503 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"

author	iuc
date	Wed, 02 Dec 2020 21:23:06 +0000
parents
children	e362b3143cde

comparison

equal deleted inserted replaced

--1:000000000000
+:1062d6ad6503
+#!/usr/bin/env R
+## Helper functions for processing variant data, especially for
+## data which contains duplicate variants differing only on
+## annotation.
+difference_in_group <- function(lines) {
+#' Find the columns containing the differences between a
+#' group of lines sharing the same POS and ALT.
+#' e.g.
+#'         CHROM  POS REF ALT  IMPACT FUNCLASS   AA   GENE
+#' 1 NC_045512.2 3037   C   T     LOW   SILENT F924 ORF1ab
+#' 2 NC_045512.2 3037   C   T     LOW   SILENT F106 ORF1ab
+#' 3 NC_045512.2 3037   C   T  MEDIUM   SILENT F106 ORF1ab
+#'
+#' should yield:
+#'
+#'            unique.name
+#' 1    3037 T (LOW|F924)
+#' 2    3037 T (LOW|F106)
+#' 3 3037 T (MEDIUM|F106)
+#'
+#' i.e. it identifies that IMPACT and AA are the differing
+#'      columns for each of the rows
+#'
+#' Ideally this function should just be used as
+#' ``tab %>% group_by(POS, ALT) %>% difference_in_group()``
+#'
+diff.colnames <- c()
+nlines <- nrow(lines)
+if (nlines > 1) {
+for (i in 1:(nlines - 1)) {
+test1 <- lines[i, ]
+for (j in i:nlines) {
+test2 <- lines[j, ]
+diff.colnames <- c(diff.colnames,
+names(test1[!(test1 %in% test2)]))
+}
+}
+}
+uni_select <- c("POS", "ALT", diff.colnames)
+return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint
+}
+split_table_and_process <- function(tab) {
+#' Split TAB into groups sharing the same POS and ALT
+#' and create distinguishing labels.
+#'
+#' Calls the above ``difference_in_group`` for each
+#' discovered group.
+#'
+#' This function is necessary because tidyr is difficult
+#' to write custom group binding functions.
+posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint
+nlines <- nrow(tab)
+groups <- list()
+groups[[1]] <- c(1, 1)
+last_pa <- paste(posalts[1, ])
+for (r in 2:nlines) {
+curr_pa <- paste(posalts[r, ])
+posalt_diff_between_lines <- !all(last_pa == curr_pa)
+if (posalt_diff_between_lines) {
+## end of current group, start of new
+groups[[length(groups)]][2] <- r - 1     ## change prev end
+groups[[length(groups) + 1]] <- c(r, r)  ## set (start, end)
+} else if (r == nlines) {
+## i.e. if the very last line shares
+## the same POS ALT as the one before,
+## close current group.
+groups[[length(groups)]][2] <- r
+}
+last_pa <- curr_pa
+}
+as_tibble(do.call(
+"rbind",
+lapply(groups, function(grange) {
+expand_range <- grange[1]:grange[2]
+difference_in_group(tab[expand_range, ])
+})
+))
+}
+read_and_process <- function(id) {
+file <- (samples %>% filter(ids == id))$files    # nolint
+variants <- read.table(file, header = T, sep = "\t")
+uniq_ids <- split_table_and_process(variants)
+if (nrow(variants) != nrow(uniq_ids)) {
+stop(paste0(id, " '", file, "' failed: ", file, "\"",
+"nrow(variants)=", nrow(variants),
+" but nrow(uniq_ids)=", nrow(uniq_ids)))
+}
+variants <- as_tibble(cbind(variants, uniq_ids)) # nolint
+return(variants)
+}

Mercurial > repos > iuc > snpfreqplot

comparison helperFunctions.R @ 0:1062d6ad6503 draft