diff helperFunctions.R @ 0:1062d6ad6503 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/snpfreqplot/ commit 1f35303af979c16d9a3126dbc882a59f686ace5d"
author iuc
date Wed, 02 Dec 2020 21:23:06 +0000
parents
children e362b3143cde
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/helperFunctions.R	Wed Dec 02 21:23:06 2020 +0000
@@ -0,0 +1,94 @@
+#!/usr/bin/env R
+
+## Helper functions for processing variant data, especially for
+## data which contains duplicate variants differing only on
+## annotation.
+
+difference_in_group <- function(lines) {
+    #' Find the columns containing the differences between a
+    #' group of lines sharing the same POS and ALT.
+    #' e.g.
+    #'         CHROM  POS REF ALT  IMPACT FUNCLASS   AA   GENE
+    #' 1 NC_045512.2 3037   C   T     LOW   SILENT F924 ORF1ab
+    #' 2 NC_045512.2 3037   C   T     LOW   SILENT F106 ORF1ab
+    #' 3 NC_045512.2 3037   C   T  MEDIUM   SILENT F106 ORF1ab
+    #'
+    #' should yield:
+    #'
+    #'            unique.name
+    #' 1    3037 T (LOW|F924)
+    #' 2    3037 T (LOW|F106)
+    #' 3 3037 T (MEDIUM|F106)
+    #'
+    #' i.e. it identifies that IMPACT and AA are the differing
+    #'      columns for each of the rows
+    #'
+    #' Ideally this function should just be used as
+    #' ``tab %>% group_by(POS, ALT) %>% difference_in_group()``
+    #'
+    diff.colnames <- c()
+    nlines <- nrow(lines)
+    if (nlines > 1) {
+        for (i in 1:(nlines - 1)) {
+            test1 <- lines[i, ]
+            for (j in i:nlines) {
+                test2 <- lines[j, ]
+                diff.colnames <- c(diff.colnames,
+                                   names(test1[!(test1 %in% test2)]))
+            }
+        }
+    }
+    uni_select <- c("POS", "ALT", diff.colnames)
+    return(lines[, uni_select] %>% unite(uni_select, sep = " ")) # nolint
+}
+
+split_table_and_process <- function(tab) {
+    #' Split TAB into groups sharing the same POS and ALT
+    #' and create distinguishing labels.
+    #'
+    #' Calls the above ``difference_in_group`` for each
+    #' discovered group.
+    #'
+    #' This function is necessary because tidyr is difficult
+    #' to write custom group binding functions.
+    posalts <- tab %>% group_by(POS, ALT) %>% select(POS, ALT) # nolint
+    nlines <- nrow(tab)
+    groups <- list()
+    groups[[1]] <- c(1, 1)
+    last_pa <- paste(posalts[1, ])
+    for (r in 2:nlines) {
+        curr_pa <- paste(posalts[r, ])
+        posalt_diff_between_lines <- !all(last_pa == curr_pa)
+        if (posalt_diff_between_lines) {
+            ## end of current group, start of new
+            groups[[length(groups)]][2] <- r - 1     ## change prev end
+            groups[[length(groups) + 1]] <- c(r, r)  ## set (start, end)
+        } else if (r == nlines) {
+            ## i.e. if the very last line shares
+            ## the same POS ALT as the one before,
+            ## close current group.
+            groups[[length(groups)]][2] <- r
+        }
+        last_pa <- curr_pa
+    }
+    as_tibble(do.call(
+        "rbind",
+        lapply(groups, function(grange) {
+            expand_range <- grange[1]:grange[2]
+            difference_in_group(tab[expand_range, ])
+        })
+    ))
+}
+
+read_and_process <- function(id) {
+    file <- (samples %>% filter(ids == id))$files    # nolint
+    variants <- read.table(file, header = T, sep = "\t")
+    uniq_ids <- split_table_and_process(variants)
+    if (nrow(variants) != nrow(uniq_ids)) {
+        stop(paste0(id, " '", file, "' failed: ", file, "\"",
+                    "nrow(variants)=", nrow(variants),
+                    " but nrow(uniq_ids)=", nrow(uniq_ids)))
+    }
+    variants <- as_tibble(cbind(variants, uniq_ids)) # nolint
+    return(variants)
+}