Mercurial > repos > galaxyp > mqppep_preproc

---
title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
author:
- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
date:
- "May 28, 2018"
- "; revised December 7, 2022"
lot: true
output:
  pdf_document:
    toc: true
    toc_depth: 2
    keep_tex: true
    dev: pdf
    includes:
      in_header: mqppep_anova_preamble.tex
latex_macros: false
raw_tex: true
urlcolor: blue
params:
  alphaFile:            "test-data/alpha_levels.tabular"
  inputFile:            "test-data/test_input_for_anova.tabular"
  preprocDb:            "test-data/test_input_for_anova.sqlite"
  kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]
  regexSampleNames:     "\\.\\d+[A-Z]$"
  regexSampleGrouping:  "\\d+"
  groupFilterPatterns:  ".+"
  groupFilter:    !r c("none", "exclude", "include")[1]
  imputationMethod:     !r c("group-median", "median", "mean", "random")[4]
  kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
  #imputationMethod:     !r c("group-median", "median", "mean", "random")[1]

  # how should sample groups be interpreted?
  #  - "f": fixed patterns (like `grep -F`)
  #  - "p": PERL-compatible (like `grep -P`)
  #  - "r": extended grep patterns (like `grep -E`)
  # use what case sensitivity?
  #  - "i": case insensitive matching (like `grep -i`)
  groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
  # what pattern should be used for the first column
  #   (extended grep pattern, case sensitive)
  firstDataColumn:      "^Intensity[^_]"
  # for small random value imputation, what percentile should be center?
  meanPercentile:       50
  #meanPercentile:       1
  # for small random value imputation, what should `s / mean(x)` ratio be?
  sdPercentile:         1.0
  # output path for imputed data file
  imputedDataFilename:  "test-data/imputedDataFilename.txt"
  # output path for imputed/quantile-normalized/log-transformed data file
  imputedQNLTDataFile:  "test-data/imputedQNLTDataFile.txt"
  # output path for contents of `stats_metadata_v` table
  anovaKseaMetadata:    "test-data/anovaKseaMetadata.txt"
  # how to test one variable with > 2 categories (e.g., aov or kruskal.test)
  oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
  # how to test one variable with 2 categories (e.g., oneway.test)
  oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]
  # what should be the minimum quality for consideration in both
  minQuality:           0
  # correct KSEA with FDR (recommended) or raw p-value
  kseaCutoffStatistic:  !r c("FDR", "p.value")[1]
  # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)
  #   "perhaps better" meaning that KSEA is an hypothesis-generator, not -test
  #kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5)[1]
  # minimum number of substrates required for a kinase to be considered in KSEA
  kseaMinSubstrateCount: 1
  # Should KSEA be performed aggregating signed log2FC or absolute?
  # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
  # TRUE  use abs(log2FC) for KSEA as Justin Drake requested; this is a
  #         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
  kseaUseAbsoluteLog2FC: TRUE
  #kseaUseAbsoluteLog2FC: FALSE
  # minimum number of observed values per sample-group
  intensityMinValuesPerGroup: 1
  # maximum number of heatmap rows (result are poor when > 50)
  intensityHeatmapRows: 50
  # what should be the primary criterion to eliminate excessive heatmap rows
  intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
  # should correlation among substrates be used (rather than covariance)
  correlateSubstrates:  TRUE
  # only show covariance among variables having variance > 1
  filterCovVarGT1:      FALSE
  # maximum number of residues to display for ppeps in rownames or columnames
  ppepTruncN:           10
  # maximum number of characters of subgenes to display in rownames or columnames
  subgeneTruncN:        10
  # maximum number of characters for paste(subgene, ppep) for enrichment plots
  substTruncN:          20
  # should boxplots use variable-width boxes to reflect # of samples
  boxPlotVarWidth:      TRUE
  # should boxplots use notched boxes to reflect difference between samples
  boxPlotNotch:         TRUE
  # look-up tables for kinase descriptions
  kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
  kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
  # should debugging trace messages be printed?
  showEnrichedSubstrates: FALSE
  # should row-scaling be applied to heatmaps: "none" or "row"
  defaultHeatMapRowScaling: !r c("none", "row")[1]
  # how missing values be displayed on heat maps: "NA" or " "
  heatMapNAcellNote:    !r c(" ", "NA")[1]
  # how missing values be displayed on heat maps: "NA" or " "
  heatMapNAgrey:        "#D8D8D8"
  # temporary hack
  heatMapNAsubstitute:  TRUE
  # what color should be used for missing values be displayed on heat maps
  heatMapNAcellColor:   "grey15"
  # should debugging trace messages be printed?
  printTraceMsgs:       FALSE
  # when debugging files are needed, set debugFileBasePath to the path
  #   to the directory where they should be written
  debugFileBasePath:    !r if (TRUE) NULL else "test-data"
  # should debugging nb/nbe messages be printed?
  printNBMsgs:          FALSE
  #printNBMsgs:          TRUE
---

```{r setup, include = FALSE, results = 'asis'}

# simple debug messaging
print_nb_messages <- params$printNBMsgs

nb <- if (!print_nb_messages) {
    function(...) invisible()
  } else {
    function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...)
  }

nbe <- if (!print_nb_messages) {
    function(...) invisible()
  } else {
    function(..., f = cat, file = stderr()) {
      cat(
        stringi::stri_unescape_unicode("\nN.B. \\u2203\\u2283\\u2200"),
        ...,
        file = file
      )
    }
  }

#ref for debugging: https://yihui.org/tinytex/r/#debugging
options(tinytex.verbose = TRUE)

# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300)

# freeze the random number generator so the same results will be produced
#  from run to run
set.seed(28571)

### LIBRARIES

if (print_nb_messages) nbe("library(gplots)")
library(gplots)
if (print_nb_messages) nbe("library(caret)")
# load caret for nearZeroVar
if (print_nb_messages) nbe("Please ignore any messages about systemd.\n")
library(caret)
if (print_nb_messages) nbe("library(DBI)")
library(DBI)
if (print_nb_messages) nbe("library(RSQLite)")
library(RSQLite)
if (print_nb_messages) nbe("library(sqldf)\n")
# Suppress "Warning: no DISPLAY variable so Tk is not available"
suppressWarnings(suppressMessages(library(sqldf)))

# required but not added to search list:
# - DBI
# - RSQLite
# - ggplot2
# - knitr
# - latex2exp
# - preprocessCore
# - reshape2
# - vioplot

### CONSTANTS

const_boxplot_fill           <- "grey94"
const_ksea_astrsk_kinases    <- 1
const_ksea_nonastrsk_kinases <- 2
const_ksea_all_kinases       <- 3
const_log10_e                <- log10(exp(1))
const_stripchart_cex         <- 0.5
const_stripchart_jitter      <- 0.3
const_table_anchor_bp        <- "bp"
const_table_anchor_ht        <- "ht"
const_table_anchor_p         <- "p"
const_table_anchor_t         <- "t"
const_table_anchor_tbp       <- "tbp"


### GLOBAL VARIABLES (params)

## functions to process params

is_string_null_or_empty <- function(x) {
  # N. B. non-strings are intentionally treated as NULL
  if (is.null(x))
    TRUE
  else if (!is.character(x))
    TRUE
  else  x == ""
}

##' Catch *and* save both errors and warnings, and in the case of
##' a warning, also keep the computed result.
##' return result as list(value = ..., warning = ...)
##' - value will be:
##'   - the result if no exception is thrown
##'   - the exception if an exception is caught
##' - warning will be a string except perhaps when warning argument is not NULL
##'
##' adapted from `demo(error.catching)`
##'
##' @title tryCatch both warnings (with value) and errors
##' @param expr an \R expression to evaluate
##' @return a list with 'value' and 'warning', where
##'   'value' may be an error caught.
##' @author Martin Maechler;
##' Copyright (C) 2010-2012  The R Core Team
try_catch_w_e <-
  function(expr, error = function(e) e, warning = NULL) {
    wrn <- NULL
    # warning handler
    w_handler <-
      if (is.function(warning))
        warning
      else
        function(w) {
          wrn <<- w
          invokeRestart("muffleWarning")
        }
    e_handler <-
      if (is.function(error))
        error
      else
        function(e) e
    # return result as list(value = ..., warning = ...)
    # - value will be:
    #   - the result if no exception is thrown
    #   - the exception if an exception is caught
    list(
      value = withCallingHandlers(
        tryCatch(
          expr,
          error = e_handler
        ),
        warning = w_handler
      ),
      warning = wrn
    )
  }

see_kvp <-
  function(format, key, value, suffix = "") {
    if (
      !all(
        is.character(format),
        is.character(key),
        is.character(value),
        is.character(suffix)
      )
    ) {
      cat("all arguments to see_kvp should be character")
      knitr::knit_exit()
    }
    result <- sprintf(format, value)
    if (length(result) > 1) {
      sprintf(
        "%s = c(%s)%s",
        whack_underscores(key),
        paste(result, collapse = ", "),
        suffix
      )
    } else {
       sprintf(
         "%s = %s%s",
         key,
         result,
         suffix
       )
    }
  }

see_logical <-
  function(x, suffix = "", xprssn = deparse1(substitute(x))) {
      result <- as.character(as.logical(x))
      # handle NAs and NaNs
      result[is.na(result)] <- "NA"
      see_kvp(
        format = "%s",
        key    = xprssn,
        value  = result,
        suffix = suffix
      )
  }

see_numeric <-
  function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
    if (is.numeric(digits) && is.numeric(x)) {
      digits <- as.integer(digits)
      digits <- min(16, max(0, digits))
      format <- paste0("%0.", as.character(digits), "g")
      result <- sprintf(format, x)
      see_kvp(
        format = "%s",
        key    = xprssn,
        value  = result,
        suffix = suffix
      )
    }
  }

see_character <-
  function(x, suffix = "", xprssn = deparse1(substitute(x))) {
    if (is.character(x)) {
      see_kvp(
        format = "%s",
        key    = xprssn,
        value  = sprintf("\"%s\"", x),
        suffix = suffix
      )
    }
  }

see_variable <-
  function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
    if (is.character(x)) {
      see_character(x, suffix, xprssn)
    } else if (is.numeric(x)) {
      see_numeric(x, suffix, digits, xprssn)
    } else if (is.logical(x)) {
      see_logical(x, suffix, xprssn)
    } else {
      f <- file("")
      sink(f)
      str(x)
      msg <- paste(readLines(f), collapse = "\n")
      sink()
      close(f)
      paste0(
        "see_variable - str(",
        xprssn,
        "):\n",
        msg, "\n"
      )
    }
  }

divert_warnings <-
  function(expr, classes = "warning") {
    withCallingHandlers(
      expr,
      warning = function(w) {
        cat("  divert_warnings: ", w$message, "\n", file = stderr())
        if (inherits(w, classes))
          tryInvokeRestart("muffleWarning")
        }
      )
  }

# ref: https://tug.org/texinfohtml/latex2e.html
# LaTeX sets aside the following characters for special purposes.
#   For example, the percent sign % is for comments.
#   They are called reserved characters or special characters.
#   They are all discussed elsewhere in this manual.
#
#   $ % & { } _ ~ ^ \ #
#
# If you want a reserved character to be printed as itself, in the text body
#   font, for all but the final three characters in that list simply put
#   a backslash \ in front of the character.
#   Thus, typing \$1.23 will produce $1.23 in your output.
#
# As to the last three characters, to get a tilde in the text body font,
#   use \~{} (omitting the curly braces would result in the next character
#   receiving a tilde accent).
# Similarly, to get a text body font circumflex use \^{}.
#   To get a backslash in the font of the text body enter \textbackslash{}.
whack_math <- if (TRUE) {
  function(v) {
    v <- as.character(v)
    w <- v
    w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE)
    w <- Reduce(
          f = function(l, r) {
            ptrn <- paste0("\\", r)
            for (i in seq_along(l)) {
              if (!grepl(ptrn, l[i], fixed = TRUE)) {
                l[i] <- gsub(r, ptrn, l[i], fixed = TRUE)
              }
            }
            l
          },
          x = c("#", "$", "%", "&", "{", "}", "_"),
          init = w
        )
    w <- gsub("^", "\\^{}", w, fixed = TRUE)
    w <- gsub("\\textbackslash \\_", "\\_", w, fixed = TRUE)
    return(w)
  }
} else {
  function(v) {
    v <- as.character(v)
    w <- v
    w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE)
    w <- Reduce(
          f = function(l, r) {
            if (length(l) > 1) {
              cat("whack_math: deparse1(v) = `", deparse1(v), "`\n",
                  file = stderr())
              cat("whack_math: v = `", v, "`\n", file = stderr())
              cat("  Reduce f: l = `", l,
                "` r = `", r,
                "`\n", file = stderr()
                )
              divert_warnings(grepl(r, l, fixed = TRUE))
            }
            ptrn <- paste0("\\", r)
            cat("ptrn: `", ptrn, "`\n", file = stderr())
            for (i in seq_along(l)) {
              cat("  before l[i] = ", l[i], "\n", file = stderr())
              if (!grepl(ptrn, l[i], fixed = TRUE)) {
                l[i] <- gsub(r, ptrn, l[i], fixed = TRUE)
              }
              cat("   after l[i] = ", l[i], "\n", file = stderr())
            }
            l
          },
          x = c("#", "$", "%", "&", "{", "}", "_"),
          init = w
        )
    w <- gsub("^", "\\^{}", w, fixed = TRUE)
    w <- gsub("\\textbackslash \\_", "\\_", w, fixed = TRUE)
    return(w)
    if (all(v == w))
      v
    else
      paste0("\\texttt{", w, "}")
  }
}
whack_underscores <- whack_math

## dump params to stderr (remove this eventually)

if (FALSE) nbe(see_variable(params))

## unlist params for eventual output

param_unlist <- unlist(as.list(params))

# no need to whack underscores and dollars because this is verbatim
param_df <- data.frame(
  parameter = paste0("\\verb@", names(param_unlist), "@"),
  value = paste0(
    "\n\\begin{tiny}\n\\verb@",
    param_unlist,
    "@\n\\end{tiny}"
    )
  )
param_df <- data.frame(
  parameter = names(param_unlist),
  value = param_unlist
  )
param_df <- param_df[order(param_df$parameter), ]

## general output control

debug_file_base_path     <- params$debugFileBasePath
print_trace_messages     <- params$printTraceMsgs
show_enriched_substrates <- params$showEnrichedSubstrates
boxplot_varwidth         <- params$boxPlotVarWidth
boxplot_notch            <- params$boxPlotNotch

## parameters for static data

kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2
kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2

## parameters for input file

preproc_db        <- params$preprocDb
alpha_file        <- params$alphaFile
input_file        <- params$inputFile

# First data column - ideally, this could be detected via
#   regexSampleNames, but for now leave it as is.
first_data_column <- params$firstDataColumn
fdc_is_integer    <- is.integer(first_data_column)
if (fdc_is_integer) {
  first_data_column <- as.integer(params$firstDataColumn)
}

## parameters for output files

ksea_app_prep_db      <- params$kseaAppPrepDb
imputed_data_filename <- params$imputedDataFilename
imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
anova_ksea_mtdt_file  <- params$anovaKseaMetadata

## parameters for imputation

# Imputation method, should be one of
#   "random", "group-median", "median", or "mean"
imputation_method <- params$imputationMethod

# Selection of percentile of logvalue data to set the mean for random number
#   generation when using random imputation
mean_percentile <- params$meanPercentile / 100.0

# deviation adjustment-factor for random values; real number.
sd_percentile <- params$sdPercentile

## parameters for group parsing and filtering

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames
# Regular expression to extract Sample Grouping from Sample Name;
#   if error occurs, compare smpl_trt vs. sample_name_matches
#   to see if groupings/pairs line up
#   e.g., "(\\d+)"

regex_sample_grouping <- params$regexSampleGrouping
# What are the patterns for filtering sample groups?
# How should sample groups be filtered?
#   - none:    do not filter
#   - include: include sample groups matching filter
#   - exclude: include sample groups not matching filter

sample_group_filter <- params$groupFilter
if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_perl <- FALSE
  sample_group_filter_fixed <- TRUE
} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
  sample_group_filter_perl <- TRUE
  sample_group_filter_fixed <- FALSE
} else { # normal regex
  sample_group_filter_perl <- FALSE
  sample_group_filter_fixed <- FALSE
}

sample_group_filter_nocase <-
  grepl("i", params$groupFilterMode, fixed = TRUE)

# What PCRE patterns should be included or excluded
group_filter_patterns_csv <- params$groupFilterPatterns
sample_group_filter_patterns <- strsplit(
    x = group_filter_patterns_csv,
    split = ",",
    fixed = TRUE
  )[[1]]

## parameters for hypothesis testing

one_way_all_categories_fname <- params$oneWayManyCategories

one_way_all_categories <- try_catch_w_e(
  match.fun(one_way_all_categories_fname))

if (!is.function(one_way_all_categories$value)) {
  write("fatal error for parameter oneWayManyCategories:", stderr())
  write(one_way_all_categories$value$message,             stderr())
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    knitr::knit_exit()
    quit(save = "no", status = 1)
  }
}

one_way_all_categories <- one_way_all_categories$value

one_way_two_categories_fname <- params$oneWayManyCategories
one_way_two_categories <- try_catch_w_e(
  match.fun(one_way_two_categories_fname))
if (!is.function(one_way_two_categories$value)) {
  cat("fatal error for parameter oneWayTwoCategories: \n")
  cat(one_way_two_categories$value$message, fill = TRUE)
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    knitr::knit_exit()
    quit(save = "no", status = 1)
  }
}
one_way_two_categories <- one_way_two_categories$value

## parameters for KSEA

ksea_cutoff_statistic <- params$kseaCutoffStatistic
ksea_cutoff_threshold <- params$kseaCutoffThreshold
ksea_min_substrate_count <- params$kseaMinSubstrateCount

## parameters for global variables consumed by functions

# intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1
# TODO Validate within list
g_intensity_hm_criteria <- params$intensityHeatmapCriteria
if (is_string_null_or_empty(g_intensity_hm_criteria)) {
  cat("invalid intensityHeatmapCriteria parameter (must be string)")
  knitr::knit_exit()
}
switch(
  g_intensity_hm_criteria,
  "quality" = NULL,
  "na_count" = NULL,
  "p_value" = NULL,
  {
    with(
      params,
      cat(
        sprintf(
        "invalid %s (must be %s)",
        see_variable(intensityHeatmapCriteria),
        "one of quality or na_count or p_value"
        )
      )
    )
    knitr::knit_exit()
  }
)

g_default_heatmap_row_scaling <-
  params$defaultHeatMapRowScaling
if (
  !is.character(g_default_heatmap_row_scaling) ||
  !(g_default_heatmap_row_scaling %in% c("row", "none"))
  ) {
  cat("invalid defaultHeatMapRowScaling (must be 'row' or 'none')")
  knitr::knit_exit()
}

# intensityHeatmapRows: 50
# TODO Validate >> 0 < 75
g_intensity_hm_rows     <- params$intensityHeatmapRows
if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) {
  cat("invalid intensityHeatmapRows (must be integer > 0)")
  knitr::knit_exit()
}

g_intensity_min_per_class <- params$intensityMinValuesPerGroup
if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) {
  cat("invalid intensityMinValuesPerGroup (must be integer > -1")
  knitr::knit_exit()
}
g_correlate_substrates  <- params$correlateSubstrates
if (is.na(as.logical(g_correlate_substrates))) {
  cat("invalid correlateSubstrates (must be TRUE or FALSE)")
  knitr::knit_exit()
}
g_filter_cov_var_gt_1 <- params$filterCovVarGT1
if (is.na(as.logical(g_filter_cov_var_gt_1))) {
  cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)")
  knitr::knit_exit()
}

# TODO Validate >> 0 < 30
g_ppep_trunc_n          <- params$ppepTruncN

# TODO Validate >> 0 < 30
g_subgene_trunc_n       <- params$subgeneTruncN

# TODO Validate >> 0 < 30
g_sbstr_trunc_n         <- params$substTruncN


### OPERATORS

# Test for exclusion
# ref: https://www.reneshbedre.com/blog/in-operator-r.html
`%notin%` <- Negate(`%in%`)

# Augmented assignment
# ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign
`%||:=%` <- function(lvalue, ...) {
  pf <- parent.frame()
  rvalue <- Reduce(paste0, x = ..., init = lvalue)
  assign(
    x = as.character(substitute(lvalue)),
    value = rvalue,
    pos = pf
  )
  invisible(rvalue)
}
# Infix concatenation
`%||%` <- function(lvalue, rvalue) paste0(lvalue, rvalue)


### FUNCTIONS

no_op <-
  function() {
  }
# this function is not used in this file and should be removed while
#   factoring out reusable code
all_apply <- function(f, v, na_rm = TRUE, ...) {
  Reduce(
    f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r,
    x = sapply(X = v, FUN = f, ...),
    init = TRUE
  )
}

write_debug_file <- function(data_frame) {
  if (!is.null(debug_file_base_path)) {
    s_path <-
      sprintf(
        "%s/%s.txt",
        debug_file_base_path,
        deparse(substitute(data_frame))
      )
    write.table(
      data_frame,
      file = s_path,
      sep = "\t",
      col.names = TRUE,
      row.names = TRUE,
      quote = FALSE
    )
  }
}

# ref: http://adv-r.had.co.nz/Environments.html
# "When creating your own environment, note that you should set its parent
#   environment to be the empty environment. This ensures you don't
#   accidentally inherit objects from somewhere else."
# Caution: this prevents `with(my_env, expr)` from working when `expr`
#   contains anything from the global environment, even operators!
#   Hence, `x <- 1; get("x", new_env())` fails by design.
new_env <- function() {
  new.env(parent = emptyenv())
}

# make apply readable for rows
row_apply <- function(x, fun, ..., simplify = TRUE) {
  apply(x, MARGIN = 1, fun, ..., simplify = TRUE)
}

# make apply readable for columns
column_apply <- function(x, fun, ..., simplify = TRUE) {
  apply(x, MARGIN = 2, fun, ..., simplify = TRUE)
}

##' Produce a vector of boolean values whose i-th value is TRUE when any
##'   member of v matches the i-th membr of s, where i in 1:seq_len(length(s))
##'
##' @title Search multiple strings for matches of multiple substrings
##' @param v a vector of substrings to match
##' @param s a vector of strings to search for matches
##' @param ... additional arguments to grepl
##' @return a list with keys in s and valuse that are vectors of elements of v
##' @author Art Eschenlauer
##' Copyright (C) 2022 Art Eschenlauer;
##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
mgrepl <- function(v, s, ...) {
  grpl_rslt <- rep_len(0, length(s))
  for (vi in v) {
    grpl_rslt_v <- sapply(
      X = s,
      FUN = function(t) {
        Reduce(
          f = function(l, r) if (is.null(l)) r else c(l, r),
          x = sapply(
            X = vi,
            FUN = function(f) grepl(f, t, ...)
          ),
          init = c()
        )
      },
      simplify = "array"
    )
    grpl_rslt <- grpl_rslt + grpl_rslt_v
  }
  unname(grpl_rslt > 0)
}

##' Produce positions in a vector where succeeding value != current valus
##'
##' @title Search vector for neighboring positions having different values
##' @param v a vector of comparable numeric values (e.g. integers)
##' @return a vector of positions i where v[i] != v[i + 1]
##' @author Art Eschenlauer
##' Copyright (C) 2022 Art Eschenlauer;
##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
transition_positions <- function(v) {
  Reduce(
    f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l,
    x = seq_along(v)[-1:0],
    init = c()
  )
}

### figure debug functions

cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) {
  cat(
    sprintf(
      "%spar(%s) = c(%s)%s",
      lbl,
      par_name,
      paste(par(par_name), collapse = ", "),
      if (newlines) "\n\n" else ""
    )
  )
}

cat_margins <- function(lbl = NULL) {
  for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma"))
    cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL)
}

cat_variable <-
  function(x, suffix = "", digits = 3, force_str = FALSE) {
    xprssn <- deparse1(substitute(x))
    if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) {
      cat(
        paste0(
          "\n\\texttt{\\textbf{",
          whack_underscores(xprssn),
          "}} [",
          typeof(x),
          ",",
          mode(x),
          "] =\n"
        )
      )
      cat("\n\\begin{verbatim}\n")
      str(x)
      cat("\n\\end{verbatim}\n")
    } else {
      cat("\n", see_variable(x, suffix, digits, xprssn))
    }
  }

### structure helper functions

# ref: staque.R - Icon-oriented stack and queue operations
# - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8
#
# sq_get(v):x produces the leftmost element of v and removes it from v,
#   but produces NA if v is empty
sq_get <- function(v) {
  if (length(v) == 0) return(NA)
  assign(as.character(substitute(v)), v[-1], parent.frame())
  return(v[1])
}
#
# sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v,
#   producing v.
#   Values are pushed in order from left to right,
#   so xn becomes the last (rightmost) value on v.
#   sq_put(v) with no second argument does nothing.
sq_put <- function(v, x = NA, ...) {
  pf <- parent.frame()
  if (is.null(x)) return(pf$v)
  if (
    !(length(x) > 1) &&
    !rlang::is_closure(x) &&
    is.na(x)
    ) return(pf$v)
  assign(as.character(substitute(v)), c(v, x, ...), pf)
  pf[[as.character(substitute(v))]]
}

### numerical/statistical helper functions

any_nan <- function(x) {
  !any(x == "NaN")
}

# determine standard deviation of quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok])
}

# compute anova raw p-value
anova_func <- function(x, grouping_factor, one_way_f) {
  subject <- data.frame(
    intensity = x
  )
  x_aov <-
    one_way_f(
      formula = intensity ~ grouping_factor,
      data = subject
      )
  pvalue <-
    if (identical(one_way_f, aov))
      summary(x_aov)[[1]][["Pr(>F)"]][1]
    else
      pvalue <- x_aov$p.value
  pvalue
}

# This code adapted from matrixcalc::is.positive.definite
#   Notably, this simply tests without calling stop()
# nolint start
#   squash un-actionable cyclomatic_complexity warning
is_positive_definite <- function(x, tol = 1e-08) {
# nolint end
  if (!is.matrix(x))
    return(FALSE)
  if (!is.numeric(x))
    return(FALSE)
  if (nrow(x) < 1)
    return(FALSE)
  if (ncol(x) < 1)
    return(FALSE)
  if (nrow(x) != ncol(x))
    return(FALSE)
  sum_symm <- sum(x == t(x), na.rm = TRUE)
  value_count <- Reduce("*", dim(x))
  if (sum_symm != value_count)
    return(FALSE)
  eigenvalues <- eigen(x, only.values = TRUE)$values
  n <- nrow(x)
  for (i in 1:n) {
    if (abs(eigenvalues[i]) < tol) {
      eigenvalues[i] <- 0
    }
  }
  if (any(eigenvalues <= 0)) {
    return(FALSE)
  }
  return(TRUE)
}

### LaTeX functions

# Use this like print.data.frame, from which it is adapted:
data_frame_table_latex <-
  function(
    x,
    # digits to pass to format.data.frame
    digits = NULL,
    # TRUE -> right-justify columns; FALSE -> left-justify
    right = TRUE,
    # maximumn number of rows to print
    max = NULL,
    # string with justification of each column
    justification = NULL,
    # TRUE to center on page
    centered = TRUE,
    # optional caption
    caption = NULL,
    # h(inline); b(bottom); t (top) or p (separate page)
    anchor = "h",
    # set underscore_whack to TRUE to escape underscores
    underscore_whack = TRUE,
    # how to emit results
    emit = cat
  ) {
    if (is.null(justification))
      justification <-
        Reduce(
          f = paste,
          x = rep_len(if (right) "r" else "l", length(colnames(x)))
          )
    n <- length(rownames(x))
    if (length(x) == 0L) {
      emit(
        sprintf(
          # if n is one, use singular 'row', else use plural 'rows'
          ngettext(
            n,
            "data frame with 0 columns and %d row",
            "data frame with 0 columns and %d rows"
            ),
          n
          ),
        "\n",
        sep = ""
        )
    } else if (n == 0L) {
      emit("0 rows for:\n")
      latex_itemized_list(
        v = names(x),
        underscore_whack = underscore_whack
        )
    } else {
      if (is.null(max))
        max <- getOption("max.print", 99999L)
      if (!is.finite(max)) {
        cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
        knitr::knit_exit()
      }
      omit <- (n0 <- max %/% length(x)) < n
      m <- as.matrix(
        format.data.frame(
          if (omit) x[seq_len(n0), , drop = FALSE] else x,
          digits = digits,
          na.encode = FALSE
          )
        )
      emit(
        # h(inline); b(bottom); t (top) or p (separate page)
        paste0("\\begin{table}[", anchor, "]"),
        "\\leavevmode",
        sep = "\n"
        )
      if (!is.null(caption))
        emit(paste0(" \\caption{", caption, "}"))
      if (centered) emit("\\centering\n")
      emit(
        paste(
          " \\begin{tabular}{",
          justification,
          "}\n",
          sep = ""
          )
        )

      # ref for top and bottom struts (\T and \B):
      #   https://tex.stackexchange.com/a/50355
      if (!is.null(caption))
        emit("\\B \\\\\n")
      latex_table_row(
        v = colnames(m),
        extra = " \\T \\B",
        underscore_whack = underscore_whack
        )
      emit("\\hline \\\\\n")
      for (i in seq_len(length(m[, 1]))) {
        latex_table_row(
        v = m[i, ],
        underscore_whack = underscore_whack
        )
      }
      emit(
        paste(
          " \\end{tabular}",
          "\\end{table}",
          sep = "\n"
          )
        )
      if (omit)
        emit(" [ reached 'max' / getOption(\"max.print\") -- omitted",
          n - n0, "rows ]\n")
    }
    invisible(x)
  }

# Use this like print.data.frame, from which it is adapted:
# nolint start
#   squash un-actionable cyclomatic_complexity warning
#   squash un-actionable "no visible global for ..."
data_frame_tabbing_latex <-
  function(
# nolint end
    x,
    # vector of tab stops, in inches
    tabstops,
    # vector of headings, registered with tab-stops
    headings = colnames(x),
    # digits to pass to format.data.frame
    digits = NULL,
    # maximumn number of rows to print
    max = NULL,
    # optional caption
    caption = NULL,
    # set underscore_whack to TRUE to escape underscores
    underscore_whack = TRUE,
    # flag for landscape mode
    landscape = FALSE,
    # flag indicating that subsubsection should be used for caption
    #   rather than subsection
    use_subsubsection_header = TRUE,
    # character-size indicator; for possible values, see:
    #   https://tug.org/texinfohtml/latex2e.html#Font-sizes
    charactersize = "small",
    # set verbatim to TRUE to debug output
    verbatim = FALSE
  ) {

    hlinport <- if (landscape) {
        function() cat("\\hlinlscp \\\\\n")
      } else {
        function() cat("\\hlinport \\\\\n")
      }

    tabstops_tex <-
      Reduce(
        f    = function(l, r) paste0(l, r),
        x    = sprintf("\\hspace{%0.2fin}\\=", tabstops),
        init = ""
      )

    n <- length(rownames(x))
    if (length(x) == 0L) {
      cat(
        sprintf(
          # if n is one, use singular 'row', else use plural 'rows'
          ngettext(
            n,
            "data frame with 0 columns and %d row",
            "data frame with 0 columns and %d rows"
            ),
          n
          ),
        "\n",
        sep = ""
        )
    } else if (n == 0L) {
      cat("0 rows for:\n")
      latex_itemized_list(
        v = names(x),
        underscore_whack = underscore_whack
        )
    } else {
      if (is.null(max))
        max <- getOption("max.print", 99999L)
      if (!is.finite(max)) {
        cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
        knitr::knit_exit()
      }
      omit <- (n0 <- max %/% length(x)) < n
      m <- as.matrix(
        format.data.frame(
          if (omit) x[seq_len(n0), , drop = FALSE] else x,
          digits = digits,
          na.encode = FALSE
          )
        )
      if (landscape)
        cat("\n\\begin{landscape}")
      tex_caption <-
        if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption)
          else "\n"
      # build the column names, which have multiple lines when
      #   length(headings) is a multiple of the number of columns
      column_names <- ""
      while (length(headings) > 0) {
        my_row <- c()
        for (i in 1:(1 + length(tabstops))) {
          my_field <- sq_get(headings)
          sq_put(my_row, if (is.na(my_field)) "" else my_field)
        }
        column_names %||:=% latex_tabbing_row(
          v = my_row,
          underscore_whack = underscore_whack,
          action = paste0
        )
      }

      # Begin tabbing environment after beginning charactersize environment
      if (verbatim) cat("\n\\begin{verbatim}")
      cat(
        paste0(
          "\n\\begin{", charactersize, "}", tex_caption,
          "\\begin{tabwrap}{", tabstops_tex, "}\n"
        )
      )
      # emit column names
      cat(column_names)
      # emit hline
      hlinport()
      for (i in seq_len(length(m[, 1]))) {
        my_row <- latex_tabbing_row(
          v = m[i, ],
          underscore_whack = underscore_whack,
          action = paste0
        )
        if (FALSE)
          cat(my_row)
        else
          cat(my_row)
      }
      hlinport()
      if (omit)
        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
          n - n0, "rows ]\n")
      # End charactersize environment after ending tabbing environment
      cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n"))
      if (verbatim) cat("\\end{verbatim}\n")
      if (landscape)
        cat("\\end{landscape}\n")
    }
    invisible(x)
  }

param_df_noexit <-
  function(e = NULL) {
    data_frame_tabbing_latex(
      x = param_df,
      tabstops = c(1.75),
      underscore_whack = TRUE,
      caption = "Input parameters",
      verbatim = FALSE
      )
    if (!is.null(e)) {
      sink(stderr())
      cat("Caught fatal error:\n\n")
      str(e)
      sink()
    }
  }

param_df_exit <-
  function(e = NULL) {
    param_df_noexit(e)
    knitr::knit_exit()
    exit(-1)
  }

# exit with exit code (default 0) and optional msg
exit <-
  function(code = 0, msg = NULL, use_stderr = FALSE) {
    if (!is.null(msg)) {
      if (use_stderr) sink(stderr())
      cat("\n\n", msg, "\n\n")
      if (use_stderr) sink()
    }
    q(save = "no", status = code)
  }

# make control sequences into printable latex sequences
latex_printable_control_seqs <-
  function(s) {
    s <- gsub("[\\]", "xyzzy_plugh", s)
    s <- gsub("[$]", "\\\\$", s)
    s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
    return(s)
  }
nolatex_verbatim <-
  function(expr) eval(expr)

error_knitrexit_stop <-
  function(e) {
    cat("Caught error:\n\n")
    str(e)
    knitr::knit_exit()
    stop(e)
  }

latex_verbatim <-
  function(expr) {
    cat("\n\\begin{verbatim}\n___\n")
    tryCatch(
      expr = expr,
      error = param_df_exit,
      finally = cat("...\n\\end{verbatim}\n")
    )
  }

latex_samepage <-
  function(expr) {
    cat("\n\\begin{samepage}\n")
    tryCatch(
      expr = expr,
      error = param_df_exit,
      finally = cat("\n\\end{samepage}\n")
    )
  }

# return the result of invocation after showing parameters
# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
latex_show_invocation <-
  function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) {
    function(...) {
      va <- list(...)
      my_rslt <- new_env()
      my_rslt$rslt <- NULL
      latex_verbatim(
        expr = {
          cat(sprintf("\n ..  Local variables for '%s':\n\n", f_name))
          str(va)
          if (!head_patch) {
            # return this result
            # ref:
            # https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
            cat(sprintf("\n ..  Invoking '%s'\n", f_name))
            tryCatch(
              {
                cat("\n\\end{verbatim}\n")
                rslt <- do.call(f, va)
              },
              error = param_df_exit,
              finally = cat("\n\\begin{verbatim}\n")
            )
            cat(sprintf("\n ..  '%s' returned:\n", f_name))
            str(rslt)
            my_rslt$rslt <- rslt
          }
        }
      )
      # return the result of invocation with the shown parameters
      # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
      if (head_patch) my_rslt$rslt <- do.call(f, va)
      (my_rslt$rslt)
    }
  }

latex_collapsed_vector <- function(
    collapse_string,
    v,
    underscore_whack = TRUE,
    action = cat0
  ) {
  v_sub <-
    if (underscore_whack) whack_underscores(v) else v
  action(
    paste0(
      v_sub,
      collapse = collapse_string
      )
    )
}

latex_itemized_collapsed <-
  function(collapse_string, v, underscore_whack = TRUE) {
  cat("\\begin{itemize}\n\\item ")
  latex_collapsed_vector(collapse_string, v, underscore_whack)
  cat("\n\\end{itemize}\n")
}

latex_itemized_list <- function(v, underscore_whack = TRUE) {
  latex_itemized_collapsed("\n\\item ", v, underscore_whack)
}

latex_enumerated_collapsed <-
  function(collapse_string, v, underscore_whack = TRUE) {
  cat("\\begin{enumerate}\n\\item ")
  latex_collapsed_vector(collapse_string, v, underscore_whack)
  cat("\n\\end{enumerate}\n")
}

latex_enumerated_list <- function(v) {
  latex_enumerated_collapsed("\n\\item ", v)
}

latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
  latex_collapsed_vector(" & ", v, underscore_whack)
  cat(extra)
  cat(" \\\\\n")
}

latex_tabbing_row <- function(
    v,
    extra = "",
    underscore_whack = TRUE,
    action = cat0
  ) {
    # latex_collapsed_vector applies action to result of paste0;
    #   by default, action = cat;
    #   hence, a scalar string is assigned to v_collapsed
    v_collapsed <-
      latex_collapsed_vector(
        "} \\> \\tabfill{",
        v,
        underscore_whack,
        action = paste0
      )
    action(
      "\\tabfill{",
      v_collapsed,
      "}",
      extra,
      " \\\\\n"
    )
  }

# N.B. use con = "" to emulate regular cat
fcat0 <-
  function(..., sprtr = " ", cnnctn = file()) {
    gsubfn::cat0(..., sep = sprtr, file = cnnctn)
    invisible(cnnctn)
  }

hypersub <-
  function(s) {
    hyper <- tolower(s)
    hyper <- gsub("[^a-z0-9]+", "-", hyper)
    hyper <- gsub("[-]+",       "-", hyper)
    hyper <- gsub("[_]+",       "-", hyper)
    hyper <- sub("^[-]",        "",  hyper)
    hyper <- sub("[-]$",        "",  hyper)
    return(hyper)
  }

table_href <- function(s = "offset", caption = "") {
    paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}")
  }

table_offset <- function(i = 0, s = "offset", new = FALSE) {
    paste0(
      if (new) paste0("\\newcounter{", s, "}\n") else "",
      "\\setcounter{", s, "}{\\value{table}}\n",
      paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n")
    )
  }

a_section_header <-
  function(s, prefix = "") {
    hyper <- hypersub(s)
    my_subsection_header <- sprintf(
      "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n",
      hyper,
      prefix,
      gsub("_", "\\_", s, fixed = TRUE),
      hyper
      )
    my_subsection_header
  }
section_header <- function(s) a_section_header(s, "")
subsection_header <- function(s) a_section_header(s, "sub")
subsubsection_header <- function(s) a_section_header(s, "subsub")

### SQLite functions

ddl_exec <- function(db, sql) {
  discard <- DBI::dbExecute(conn = db, statement = sql)
  if (FALSE && discard != 0) {
    need_newpage <- TRUE
    if (need_newpage) {
      need_newpage <<- FALSE
      cat("\\newpage\n")
    }
    o_file <- stdout()
    cat("\n\\begin{verbatim}\n")
    cat(sql, file = o_file)
    cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
    cat("\n\\end{verbatim}\n")
  }
}

dml_no_rows_exec <- function(db, sql) {
  discard <- DBI::dbExecute(conn = db, statement = sql)
  if (discard != 0) {
    need_newpage <- TRUE
    if (need_newpage) {
      need_newpage <<- FALSE
      cat("\\newpage\n")
    }
    cat("\n\\begin{verbatim}\n")
    o_file <- stdout()
    cat(sql, file = o_file)
    cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
    cat("\n\\end{verbatim}\n")
  }
}

### KSEA functions and helpers

#' The KSEA App Analysis (KSEA Kinase Scores Only)
#'
#' Compute KSEA kinase scores and statistics from phoshoproteomics data input
#' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC)
#'
#' Result is an R data.frame with column names
#'   "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
#' "Please refer to the original Casado et al. publication for detailed
#'   description of these columns and what they represent:
#'
#'   - Kinase.Gene indicates the gene name for each kinase.
#'   - mS          represents the mean log2(fold change) of all the
#'                   kinase's substrates.
#'   - Enrichment  is the background-adjusted value of the kinase's mS.
#'   - m           is the total number of detected substrates
#'                   from the experimental dataset for each kinase.
#'   - z.score     is the normalized score for each kinase, weighted by
#'                   the number of identified substrates.
#'   - p.value     represents the statistical assessment for the z.score.
#'   - FDR         is the p-value adjusted for multiple hypothesis testing
#'                   using the Benjamini & Hochberg method."
#'
#' @param ksdata           the Kinase-Substrate dataset uploaded from the file
#'                           prefaced with "PSP&NetworKIN_"
#'                           available from github.com/casecpb/KSEA/
#' @param px               the experimental data file formatted as described in
#'                           the KSEA.Complete() documentation
#' @param networkin        a binary input of TRUE or FALSE, indicating whether
#'                           or not to include NetworKIN predictions, where
#'                           \code{NetworKIN = TRUE}
#'                           means include NetworKIN predictions
#' @param networkin_cutoff a numeric value between 1 and infinity setting
#'                           the minimum NetworKIN score
#'                           (this can be omitted if NetworKIN = FALSE)
#'
#' @return                 creates a new R data.frame with all the KSEA kinase
#'                           scores, along with each one's statistical
#'                           assessment, as described herein.
#'
#' @references
#'
#' Casado et al. (2013) Sci Signal. 6(268):rs6
#'
#' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20
#'
#' Horn et al. (2014) Nature Methods 11(6):603-4
#'
# nolint start
#   squash un-actionable cyclomatic_complexity warning
ksea_scores <- function(
# nolint end
  # For human data, typically, ksdata = KSEAapp::ksdata
  ksdata,

  # Input data file having columns:
  # - Protein     : abbreviated protein name
  # - Gene        : HUGO gene name
  # - Peptide     : peptide sequence without indications of phosphorylation
  # - Reside.Both : position(s) of phosphorylation within Gene sequence
  #                 - First letter designates AA that is modified
  #                 - Numbers indicate position within Gene
  #                 - Multiple values are separated by semicolons
  #   - p         : p-value
  #   - FC        : fold-change
  px,

  # A binary input of TRUE or FALSE, indicating whether or not to include
  #   NetworKIN predictions
  networkin,

  # A numeric value between 1 and infinity setting the minimum NetworKIN
  #   score (can be left out if networkin = FALSE)
  networkin_cutoff,

  # Minimum substrate count, necessary to adjust the p-value appropriately.
  minimum_substrate_count

) {
  if (FALSE) {
    if (print_nb_messages) nbe("Output contents of `ksdata` table\n")
    cat_variable(ksdata)
    cat("\n\\clearpage\n")
    data_frame_tabbing_latex(
      x = ksdata[order(ksdata$GENE, ksdata$SUB_GENE), ],
      tabstops = c(0.3, 0.3, 1.2, 0.3, 0.3, 0.3, 0.3,
                   0.8, 0.3, 0.8, 0.3, 0.3, 0.3),
      caption = "ksdata dump",
      use_subsubsection_header = FALSE
    )
  }
  # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution.
  if (length(grep(";", px$Residue.Both)) == 0) {
    # There are no Residue.Both entries having semicolons, so new is
    #   simply px except two columns are renamed and a column is added
    #   for log2(abs(fold-change))
    new <- px
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
    new <- new[complete.cases(new$log2_fc), ]
  } else {
    # Split each row having semicolons in Residue.Both into rows that are
    #   duplicated in all respects except that each row has a single
    #   member of the set "split-on-semicolon-Residue.Both"
    px_double <- px[grep(";", px$Residue.Both), ]
    residues <- as.character(px_double$Residue.Both)
    residues <- as.matrix(residues, ncol = 1)
    split <- strsplit(residues, split = ";")
    # x gets count of residues in each row,
    #   i.e., 1 + count of semicolons
    x <- sapply(split, length)
    # Here is the set of split rows
    px_single <- data.frame(
      Protein      = rep(px_double$Protein, x),
      Gene         = rep(px_double$Gene,    x),
      Peptide      = rep(px_double$Peptide, x),
      Residue.Both = unlist(split),
      p            = rep(px_double$p,       x),
      FC           = rep(px_double$FC,      x)
      )
    # new first gets the split rows
    new <- px[-grep(";", px$Residue.Both), ]
    # to new, append the rows that didn't need splitting in the first place
    new <- rbind(new, px_single)
    # map Gene         to SUB_GENE
    # map Residue.Both to SUB_MOD_RSD
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    # Eliminate any non-positive values to prevent introduction of
    #   infinite or NaN values
    new[(0 <= new$log2_fc), "log2_fc"] <- NA
    # Because of preceding step, there is no need for abs in the next line
    new$log2_fc <- log2(as.numeric(as.character(new$FC)))
    # Convert any illegal values from NaN to NA
    new[is.nan(new$log2_fc), "log2_fc"] <- NA
    # Eliminate rows having missing values (e.g., non-imputed data)
    new <- new[complete.cases(new$log2_fc), ]
  }
  # At this point, new$log2_fc is signed according to which contrast has
  #   the greater intensity
  # To take the magnitude into account without taking the direction into
  #   account, set params$kseaUseAbsoluteLog2FC to TRUE
  #
  # Should KSEA be performed aggregating signed log2FC or absolute?
  # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
# nolint start
#   squash un-actionable "no visible global for ..."
  if (params$kseaUseAbsoluteLog2FC) {
# nolint end
    # TRUE  use abs(log2FC) for KSEA as Justin requested; this is a
    #         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
    new$log2_fc <- abs(new$log2_fc)
  }

  monitor_filtration_on_stderr <- TRUE
  if (monitor_filtration_on_stderr) {
    # set to TRUE to monitor filtration on stderr
    sink(stderr())
    cat(see_variable(networkin, "\n"))
  }
  ksdata_filtered <-
    sqldf::sqldf(
      sprintf("%s %s",
        "select * from ksdata where not Source = 'NetworKIN'",
        if (networkin)
          sprintf("or networkin_score >= %d", networkin_cutoff)
        else
          ""
        )
      )
  if (FALSE) write.csv(x = ksdata_filtered, file = "ksdata_filtered.csv")
  if (monitor_filtration_on_stderr) {
    cat(see_variable(sqldf::sqldf(
      "select count(*), Source from ksdata group by Source"), "\n"))
    cat(see_variable(sqldf::sqldf(
      "select count(*), Source from ksdata_filtered group by Source"), "\n"))
    sink()
  }

  ############################################################################
  # Line numbers below refer to lines of:
  #   https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R
  # I would put the original line in a comment but then lint would complain...
  # - Indeed, I had to rename all the variables because lint didn't like names
  #   containing periods or capital letters.
  # ACE
  ############################################################################
  #
  # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
  #   colnames of ksdata_filtered:
  #     "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
  #     "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
  #     "SITE_...7_AA" "networkin_score" "Source"
  #   colnames of new:
  #     "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
  # Equivalent to:
  #   SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
  #     FROM ksdata_filtered a
  #       INNER JOIN new b
  #         ON a.SUB_GENE = b.SUB_GENE
  #           AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
  #   (KSEA.Scores.R line # 105)
  #   "Extract KSData.filtered annotations that are only found in new"
  ksdata_dataset <- base::merge(ksdata_filtered, new)
  #   colnames of ksdata_dataset:
  #     "KINASE"      "KIN_ACC_ID"   "GENE"       "KIN_ORGANISM" "SUBSTRATE"
  #     "SUB_GENE_ID" "SUB_ACC_ID"   "SUB_GENE"   "SUB_ORGANISM" "SUB_MOD_RSD"
  #     "SITE_GRP_ID" "SITE_...7_AA" "networkin_score"  "Source" "Protein"
  #     "Peptide"     "p"            "FC"         "log2_fc" (uniprot_no_isoform)
  # Re-order dataset; prior to accounting for isoforms
  #   (KSEA.Scores.R line # 106)
  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
  # Extract non-isoform accession in UniProtKB
  #   (KSEA.Scores.R line # 107)
  ksdata_dataset$uniprot_no_isoform <- sapply(
    ksdata_dataset$KIN_ACC_ID,
    function(x) unlist(strsplit(as.character(x), split = "-"))[1]
    )
  #   "last expression collapses isoforms ... for easy processing"
  # Discard previous results while selecting interesting columns ...
  #   (KSEA.Scores.R line # 110)
  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
  # Column names are now:
  #   "GENE"       "SUB_GENE"        "SUB_MOD_RSD"    "Peptide" "p"
  #   "FC" "log2_fc" "Source"
  # Make column names human-readable
  #   (KSEA.Scores.R line # 111)
  colnames(ksdata_dataset_abbrev) <- c(
    "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
    "FC", "log2FC", "Source"
    )
  # SELECT * FROM ksdata_dataset_abbrev
  #   ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
  #   (KSEA.Scores.R line # 112)
  #   "Extract KSData.filtered annotations that are only found in new"
  ksdata_dataset_abbrev <-
    ksdata_dataset_abbrev[
      order(
        ksdata_dataset_abbrev$Kinase.Gene,
        ksdata_dataset_abbrev$Substrate.Gene,
        ksdata_dataset_abbrev$Substrate.Mod,
        ksdata_dataset_abbrev$p),
      ]
  if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
  if (FALSE) write.csv(
    x = ksdata_dataset_abbrev,
    file = "ksdata_dataset_abbrev_no_aggregation.csv",
    row.names = FALSE
    )

  # For some kinases, the only difference between kinase names in two databases
  #   is the case of the the letters; dedup to eliminate this false distinction
  #   The only imperfection with this approach is when the NetworKIN-only fliter
  #   is in force and there is a duplication between NetworKIN and HPRD (which
  #   outranks NetworKIN).  Presently, the script does not invoke with this
  #   function with the NetworKIN-only filter active.
  dedup_sql <- "
    select
      r.'Kinase.Gene', r.'Substrate.Gene', r.'Substrate.Mod',
      r.Peptide, r.p, r.FC, r.log2FC, r.Source
    from
      ( select
          rowid,
          rank() over
            (partition by
              lower(k.'Kinase.Gene'), k.p, k.FC
            order by
              k.Source
            ) as rank,
          *
        from
          ksdata_dataset_abbrev k
        ) as r
    where r.rank = 1
  "
  ksdata_dataset_abbrev <- sqldf::sqldf(x = dedup_sql)
  if (FALSE) write.csv(
    x = ksdata_dataset_abbrev,
    file = "ksdata_dataset_abbrev_no_aggregation_dedup.csv",
    row.names = FALSE
    )

  # First aggregation step to account for multiply phosphorylated peptides
  #   and differing peptide sequences; the goal here is to combine results
  #   for all measurements of the same substrate.
  # SELECT  `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
  #         `Source`, avg(log2FC) AS log2FC
  #   FROM  ksdata_dataset_abbrev
  #   GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
  #         `Source`
  #   ORDER BY `Kinase.Gene`;
  # in two steps:
  # (1) compute average log_2(fold-change)
  #   "take the mean of the log2FC amongst phosphosite duplicates"
  #   (KSEA.Scores.R line # 115)
  ksdata_dataset_abbrev <- aggregate(
    log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
    data = ksdata_dataset_abbrev,
    FUN = mean
    )
  if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
  # (2) order by Kinase.Gene
  #   (KSEA.Scores.R line # 117)
  ksdata_dataset_abbrev <-
    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
  # SELECT  `Kinase.Gene`, count(*)
  #   FROM  ksdata_dataset_abbrev
  #  GROUP BY `Kinase.Gene`;
  # in two steps:
  # (1) Extract the list of Kinase.Gene names
  #   "@@@@@@@@@@@@@@@@@@@@"
  #   "Do analysis for KSEA"
  #   "@@@@@@@@@@@@@@@@@@@@"
  #   (KSEA.Scores.R line # 124)
  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
  # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
  #   named by Kinase.Gene
  #   (KSEA.Scores.R line # 125)
  kinase_list <- as.matrix(table(kinase_list))
  # Second aggregation step to account for all substrates per kinase
  # CREATE TABLE mean_fc
  #   AS
  # SELECT  `Kinase.Gene`, avg(log2FC) AS log2FC
  #   FROM  ksdata_dataset_abbrev
  #   GROUP BY `Kinase.Gene`
  #   (KSEA.Scores.R line # 127)
  if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n")
  mean_fc <-
    aggregate(
      log2FC ~ Kinase.Gene,
      data = ksdata_dataset_abbrev,
      FUN = mean
      )
  if (print_nb_messages) nbe(see_variable(mean_fc), "\n")
  if (FALSE) write.csv(x = mean_fc, file = "mean_fc.csv")

  # for contrast j
  #   for each kinase i
  #     extract log2 of fold-change (from `new` above)
  #     (used in KSEA.Scores.R lines # 130 & 132)
  log2_fc_j_each_i <-
    new$log2_fc

  # for contrast j
  #   for all kinases i
  #     compute mean of abs(log2 of fold-change)
  #     (used in KSEA.Scores.R lines # 130)
  mean_abs_log2_fc_j_all_i <-
    mean(abs(log2_fc_j_each_i), na.rm = TRUE)

  # for contrast j
  #   for all kinases i
  #     compute mean of log2 of fold-change
  #     (used in KSEA.Scores.R lines # 132)
  mean_log2_fc_j_all_i <-
    mean(log2_fc_j_each_i, na.rm = TRUE)

  # Reorder mean_fc, although I don't see why
  #   (KSEA.Scores.R line 128
  mean_fc <- mean_fc[order(mean_fc[, 1]), ]
  if (FALSE) write.csv(x = mean_fc, file = "mean_fc.csv")
  # mean_fc columns so far: "Kinase.Gene", "log2FC"
  # - Kinase.Gene
  #               indicates the gene name for each kinase.

  # Create column 3: mS
  # - mS
  #               represents the mean log2(fold change) of all the
  #               kinase's substrates.
  #   (KSEA.Scores.R line # 129)
  mean_fc$m_s <-
    mean_fc_m_s <- mean_fc[, 2]

  # Create column 4: Enrichment
  # - Enrichment
  #               is the background-adjusted value of the kinase's mS.
  #   (KSEA.Scores.R line # 130)
  mean_fc$enrichment <-
    mean_fc_m_s / mean_abs_log2_fc_j_all_i

  # Create column 5: m, count of substrates of kinase (count of j for i)
  # - m
  #               is the total number of detected substrates
  #               from the experimental dataset for each kinase.
  #   (KSEA.Scores.R line # 131)
  mean_fc$m <-
    mean_fc_m <- kinase_list


  # Create column 6: z-score
  # - z.score
  #               is the normalized score for each kinase, weighted by
  #               the number of identified substrates.
  #   (KSEA.Scores.R line # 132)
  mean_fc$z_score <-
    (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) /
      sd(log2_fc_j_each_i, na.rm = TRUE)

  # Create column 7: p-value, deduced from z-score
  # - p.value
  #               represents the statistical assessment for the z.score.
  #   (KSEA.Scores.R line # 133)
  # "one-tailed p-value"
  mean_fc$p_value <-
    pnorm(-abs(mean_fc$z_score))

  # zap excluded kinases; this must be done before adjusting p-value
  if (TRUE) {
    mean_fc <-
      mean_fc[
        mean_fc$m >= minimum_substrate_count,
        ,
        drop = FALSE
      ]
  }

  # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
  # - FDR
  #               is the p-value adjusted for multiple hypothesis testing
  #               using the Benjamini & Hochberg method."
  #   (KSEA.Scores.R line # 134)
  mean_fc$fdr <-
    p.adjust(mean_fc$p_value, method = "fdr")

  # It makes no sense to leave Z-scores negative when using
  #   absolute value of fold-change
# nolint start
#   squash un-actionable "no visible global for ..."
  if (params$kseaUseAbsoluteLog2FC) {
# nolint end
    mean_fc$z_score <- abs(mean_fc$z_score)
  }

  # Remove second column (log2FC), which is duplicated as mS
  #   (KSEA.Scores.R line # 136)
  mean_fc <-
    mean_fc[order(mean_fc$Kinase.Gene), -2]
  # Correct the column names which we had to hack because of the linter...
  colnames(mean_fc) <- c(
    "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
    )
  #   (KSEA.Scores.R line # 138)
  return(mean_fc)
}

ksea_low_fdr_barplot_factory <- function(
  rslt,
  i_cntrst,
  i,
  a_level,
  b_level,
  fold_change,
  caption
) {
  rslt_score_list_i <- rslt$score_list[[i]]
  if (!is.null(rslt_score_list_i)) {
    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
    k <- data.frame(
      contrast = as.integer(i_cntrst),
      a_level = rep.int(a_level, rslt_score_list_i_nrow),
      b_level = rep.int(b_level, rslt_score_list_i_nrow),
      kinase_gene = rslt_score_list_i$Kinase.Gene,
      mean_log2_fc = rslt_score_list_i$mS,
      enrichment = rslt_score_list_i$Enrichment,
      substrate_count = rslt_score_list_i$m,
      z_score = rslt_score_list_i$z.score,
      p_value = rslt_score_list_i$p.value,
      fdr = rslt_score_list_i$FDR
    )
    selector <- switch(
      ksea_cutoff_statistic,
      "FDR" = {
        k$fdr
        },
      "p.value" = {
        k$p_value
        },
      {
        cat(
          sprintf(
            "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
            ksea_cutoff_statistic
            )
          )
        param_df_exit()
        knitr::knit_exit()
      }
    )

    k <- k[selector < ksea_cutoff_threshold, ]
    nrow_k <- nrow(k)

    if (nrow_k > 0) {
      max_nchar_rowname <- max(nchar(rownames(k)))
      my_cex_names <- 1.0 / (1 + nrow_k / 50)

      if (print_trace_messages) cat_margins("Initially")
      if (print_trace_messages) cat_variable(nrow_k,            "\n\n", 0)
      if (print_trace_messages) cat_variable(my_cex_names,      "\n\n", 0)
      if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0)

      # fin: The figure region dimensions, (width, height), in inches.
      # mar: A numerical vector of the form c(bottom, left, top, right)
      #      that gives the number of lines of margin to be specified
      #      on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1

# mar: The figure region dimensions, (width, height), in inches.
      numeric_z_score <- as.numeric(k$z_score)
      bar_order <- order(-as.numeric(k$p_value))
      kinase_name <- k$kinase_gene
      long_caption <-
        sprintf(
          "Kinase z-score, %s, KSEA %s < %s",
          caption,
          ksea_cutoff_statistic,
          ksea_cutoff_threshold
          )
      my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
      # return a function that draws the plot
      function() {
        par_fin <- par("fin") # vector of width_in_inches and height_in_inches)
        op <- par(
          bg = if (print_trace_messages) "yellow" else "white",
          fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)),
          mar = par("mar") +
            c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0)
            # bottom,     left,                                     top, right
        )
        on.exit(par(op))
        if (print_trace_messages) cat_margins("Eventually")

        barplot(
          height = numeric_z_score[bar_order],
          border = NA,
          xpd = FALSE,
          cex.names = my_cex_names,
          main = long_caption,
          cex.main = my_cex_caption,
          names.arg = kinase_name[bar_order],
          horiz = TRUE,
          srt = 45,
          las = 1,
          cex.axis = 0.9
        )
      }
    }
  } else {
    no_op
  }
}

# note that this adds elements to the global variable `ksea_asterisk_hash`

ksea_low_fdr_print <- function(
  rslt,
  i_cntrst,
  i,
  a_level,
  b_level,
  fold_change,
  caption,
  write_db = TRUE, # if TRUE, write to DB, else print table
  anchor = c(const_table_anchor_p, const_table_anchor_t)
) {
  rslt_score_list_i <- rslt$score_list[[i]]
  if (!is.null(rslt_score_list_i)) {
    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
    k <- contrast_ksea_scores <- data.frame(
      contrast = as.integer(i_cntrst),
      a_level = rep.int(a_level, rslt_score_list_i_nrow),
      b_level = rep.int(b_level, rslt_score_list_i_nrow),
      kinase_gene = rslt_score_list_i$Kinase.Gene,
      mean_log2_fc = rslt_score_list_i$mS,
      enrichment = rslt_score_list_i$Enrichment,
      substrate_count = rslt_score_list_i$m,
      z_score = rslt_score_list_i$z.score,
      p_value = rslt_score_list_i$p.value,
      fdr = rslt_score_list_i$FDR
    )

    selector <- switch(
      ksea_cutoff_statistic,
      "FDR" = {
        k$fdr
        },
      "p.value" = {
        k$p_value
        },
      {
        cat(
          sprintf(
            "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
            ksea_cutoff_statistic
            )
          )
        param_df_exit()
        knitr::knit_exit()
      }
    )

    k <- k[selector < ksea_cutoff_threshold, ]
    # save kinase names to ksea_asterisk_hash
    for (kinase_name in k$kinase_gene) {
      ksea_asterisk_hash[[kinase_name]] <- 1
    }

    if (write_db) {
      db_write_table_overwrite <- (i_cntrst < 2)
      db_write_table_append    <- !db_write_table_overwrite
      RSQLite::dbWriteTable(
        conn = db,
        name = "contrast_ksea_scores",
        value = contrast_ksea_scores,
        append = db_write_table_append
        )
      ""
    } else {
      selector <- switch(
        ksea_cutoff_statistic,
        "FDR" = {
          contrast_ksea_scores$fdr
          },
        "p.value" = {
          contrast_ksea_scores$p_value
          },
        {
          cat(
            sprintf(
              "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
              ksea_cutoff_statistic
              )
            )
          param_df_exit()
          knitr::knit_exit()
        }
      )
      if (print_nb_messages) nbe(see_variable(contrast_ksea_scores))
      output_df <- contrast_ksea_scores[
        selector < ksea_cutoff_threshold,
        c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
          "z_score", "p_value", "fdr")
        ]
      output_df$kinase_gene <-
        gsub(
          "_",
          "\\\\_",
          output_df$kinase_gene
        )
      colnames(output_df) <-
        c(
          colnames(output_df)[1],
          colnames(output_df)[2],
          "enrichment",
          "m_s",
          "z_score",
          "p_value",
          "fdr"
        )

# nolint start
      output_order <- with(output_df, order(p_value))
# nolint end
      output_df <- output_df[output_order, ]

      output_df[, 2] <- sprintf("%0.3g", output_df[, 2])
      output_df$fdr <- sprintf("%0.4f", output_df$fdr)
      output_df$p_value <- sprintf("%0.2e", output_df$p_value)
      output_df$z_score <- sprintf("%0.2f", output_df$z_score)
      output_df$m_s <- sprintf("%d", output_df$m_s)
      output_df$enrichment <- sprintf("%0.3g", output_df$enrichment)
      colnames(output_df) <-
        c(
          "Kinase or motif",
          "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)",
          "Enrichment",
          "Substrates",
          "z-score",
          "p-value",
          "FDR"
        )
      selector <- switch(
        ksea_cutoff_statistic,
        "FDR" = {
          rslt$score_list[[i]]$FDR
          },
        "p.value" = {
          rslt$score_list[[i]]$p.value
          },
        {
          cat(
            sprintf(
              "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
              ksea_cutoff_statistic
              )
            )
          param_df_exit()
          knitr::knit_exit()
        }
      )
      if (sum(selector < ksea_cutoff_threshold) > 0) {
        if (print_nb_messages) nbe(see_variable(output_df))
        math_caption <- gsub("{", "\\{", caption,      fixed = TRUE)
        math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
        if (TRUE) {
          data_frame_tabbing_latex(
            x = output_df,
            # vector of tab stops, in inches
            tabstops = c(1.8, 1.2, 0.8, 0.8, 0.8, 0.8, 0.8),
            # vector of headings, registered with tab-stops
            headings = colnames(output_df),
            # digits to pass to format.data.frame
            digits = NULL,
            # maximumn number of rows to print
            max = NULL,
            # optional caption
            caption = sprintf(
              "\\text{%s}, KSEA %s < %s",
              math_caption,
              ksea_cutoff_statistic,
              ksea_cutoff_threshold
              ),
            # set underscore_whack to TRUE to escape underscores
            underscore_whack = FALSE,
            # flag for landscape mode
            landscape = FALSE,
            # flag indicating that subsubsection should be used for caption
            #   rather than subsection
            use_subsubsection_header = TRUE,
            # character-size indicator; for possible values, see:
            #   https://tug.org/texinfohtml/latex2e.html#Font-sizes
            charactersize = "small",
            # set verbatim to TRUE to debug output
            verbatim = FALSE
            )
        }
      } else {
        cat(
          sprintf(
            "\\break
            No kinases had
            \\(\\text{KSEA %s}_\\text{enrichment} < %s\\)
            for contrast %s\\hfill\\break\n",
            ksea_cutoff_statistic,
            ksea_cutoff_threshold,
            caption
          )
        )
      }
    }
  } else {
    ""
  }
}

# create_breaks is a helper for ksea_heatmap
create_breaks <- function(merged_scores) {
  if (sum(!is.na(merged_scores)) < 2)
    return(NULL)
  if (min(merged_scores, na.rm = TRUE) < -1.6) {
    breaks_neg <- seq(-1.6, 0, length.out = 30)
    breaks_neg <-
      append(
        seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
        breaks_neg
        )
    breaks_neg <- sort(unique(breaks_neg))
  } else {
    breaks_neg <- seq(-1.6, 0, length.out = 30)
  }
  if (max(merged_scores, na.rm = TRUE) > 1.6) {
    breaks_pos <- seq(0, 1.6, length.out = 30)
    breaks_pos <-
      append(
        breaks_pos,
        seq(1.6, max(merged_scores, na.rm = TRUE),
        length.out = 10)
        )
    breaks_pos <- sort(unique(breaks_pos))
  } else {
    breaks_pos <- seq(0, 1.6, length.out = 30)
  }
  breaks_all <- unique(append(breaks_neg, breaks_pos))
  mycol_neg <-
    gplots::colorpanel(n = length(breaks_neg),
               low = "blue",
               high = "white")
  mycol_pos <-
    gplots::colorpanel(n = length(breaks_pos) - 1,
               low = "white",
               high = "red")
  mycol <- unique(append(mycol_neg, mycol_pos))
  color_breaks <- list(breaks_all, mycol)
  return(color_breaks)
}

# nolint start
#   squash un-actionable cyclomatic_complexity warning
hm2plus <- function(
# nolint end
  x,
  mat = matrix(
          c(
            c(0, 4, 0),
            c(0, 3, 3),
            c(2, 1, 1)
          ),
          nrow = 3,
          ncol = 3,
          byrow = TRUE
        ),
  denwid = 0.5,
  denhgt = 0.15,
  widths = c(0.5, 2.5, 1.5),
  heights = c(0.4, 0.15, 3.95),
  divergent = FALSE,
  notecol = "grey50",
  trace = "none",
  margins = c(6, 20),
  srtcol       = 90,
  srtrow       = 0,
  density_info = "none",
  key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
  key_par = list(),
  hclustfun = hclust,
  ...
) {

  if (FALSE) # this is to avoid commenting out code to pass linting...
    my_hm2 <- latex_show_invocation(gplots::heatmap.2, head_patch = FALSE)
  else
    my_hm2 <- gplots::heatmap.2

  x <- as.matrix(x)
  if (sum(!is.na(x)) < 1)
    return(NULL)
  color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough
  break_count <- 1 + color_count
  min_nonax <- min(x, na.rm = TRUE)
  max_nonax <- max(x, na.rm = TRUE)
  if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n")
  if (divergent) {
    zlim <- max(abs(min_nonax), abs(max_nonax))
    if (print_nb_messages) nb(see_variable(zlim, "\n"))
    breaks <- (zlim) / (break_count:1)
    if (print_nb_messages) nb(see_variable(breaks, "\n"))
    breaks <- breaks - median(breaks)
    zlim <- c(-zlim, zlim)
    if (print_nb_messages) nb(see_variable(zlim, "\n"))
  } else {
    zlim <- max(abs(min_nonax), abs(max_nonax))
    if (print_nb_messages) nb(see_variable(zlim, "\n"))
    breaks <- zlim / (break_count:1)
    if (print_nb_messages) nb(see_variable(breaks, "\n"))
    if  (max_nonax < 0) {
      breaks <- breaks - zlim
      zlim <- c(-zlim, 0)
    } else {
      zlim <- c(0, zlim)
    }
    if (print_nb_messages) nb(see_variable(zlim, "\n"))
  }
  nonax <- x
# nolint start
#   squash un-actionable "no visible global for ..."
  if (params$heatMapNAsubstitute) nonax[is.na(x)] <- min_nonax
# nolint end
  if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5)
  if (is.null(heights)) heights <- c(0.4, denhgt, 4.0)
  colors <-
    if (divergent && min_nonax < 0) {
      # divergent colors on both sides of zero
      colorRampPalette(c("red", "white", "blue"))(color_count)
    } else if (divergent && min_nonax > 0) {
      # "divergent" colors > zero
      colorRampPalette(c("white", "blue"))(color_count)
    } else if (divergent && max_nonax < 0) {
      # "divergent" colors < zero
      colorRampPalette(c("red", "white"))(color_count)
    } else {
      # "non-divergent" colors including zero
      tmp <- hcl.colors(color_count, "YlOrRd", rev = TRUE)
# nolint start
#   squash un-actionable "no visible global for ..."
      tmp[1] <- params$heatMapNAgrey
# nolint end
      tmp
    }

  nbe("There are ", sum(is.na(x)), " NA values")

  suppressWarnings(
    my_hm2(
      x = x,
      col = colors,
      density.info = density_info,
      srtCol = srtcol,
      srtRow = srtrow,
      margins = margins,
      lwid = widths,
      lhei = heights,
      key.title = NA,
      key.xlab = key_xlab,
      key.par = key_par,
      lmat = mat,
      notecol = notecol,
      trace = trace,
      bg = "yellow",
      hclustfun = hclustfun,
      oldstyle = FALSE,
      ... # varargs
    )
  )
  # implicitly returning value returned by heatmap.2
}

# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
# nolint start
#   squash un-actionable cyclomatic_complexity warning
draw_kseaapp_summary_heatmap <- function(
# nolint end
    x,                       # matrix with row/col names already formatted
    sample_cluster,          # a binary input of TRUE or FALSE,
                             #   indicating whether or not to perform
                             #   hierarchical clustering of the sample columns
    merged_asterisk,         # matrix having dimensions of x, values "*" or ""
    color_breaks,            # breaks for color gradation, from create_breaks
                             #   passed to `breaks` argument of `image`
    margins = c(8, 15),      # two integers setting the bottom and right margins
                             #   to accommodate row and column labels
    master_cex = 0.7,        # basis for text sizes
    ...
) {
  merged_scores <- x
  if (!is.matrix(x)) {
    cat(
      paste0(
        "No plot because \\texttt{typeof(x)} is '",
        typeof(x),
        "' rather than 'matrix'.\n\n"
        )
      )
    cat_variable(x)
    return(FALSE)
  }
  if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex))
  nrow_x <- nrow(x)
  ncol_x <- ncol(x)
  #if (nrow_x < 2) {
  if (nrow_x < 1) {
    cat("No plot because matrix has no rows.\n\n")
    return(FALSE)
  } else if (nrow_x < 2) {
    cat("No plot because matrix has one row.  Matrix looks like this:\n\n")
    cat("\n\\begin{verbatim}\n")
    print(x)
    cat("\n\\end{verbatim}\n")
    return(FALSE)
  } else if (ncol_x < 2) {
    cat("No plot because matrix x has ", ncol_x, " columns.\n\n")
    cat_variable(x)
    return(FALSE)
  }
  max_nchar_rowname <- max(nchar(rownames(x)))
  max_nchar_colname <- max(nchar(colnames(x)))

  my_row_cex_scale <- master_cex * 150 / nrow_x
  # row cex shrink hack begin
  my_row_cex_scale <- master_cex * 150 / max(nrow_x, ncol_x)
  # row cex shrink hack end

  my_col_cex_scale <- 3.0
  # col cex shrink hack begin
  if (ncol_x > 40)
    my_col_cex_scale <- 3.0 * 40 / ncol_x
  # col cex shrink hack end

  my_asterisk_scale <- 0.4 * my_row_cex_scale
  my_row_warp <- 1
  my_note_warp <- 2
  my_row_warp <- 1
  my_row_cex_asterisk <-
    master_cex * my_row_warp * my_asterisk_scale


  my_col_cex <- my_col_cex_scale * master_cex
  my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex)
  my_key_cex <- 1.286
  my_hm2_cex <- 1 * master_cex
  my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4
  if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x))
  if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
  my_offset <- 0.05
  if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
  my_scale <- 3.0
  if (ncol_x < 10 && nrow_x < 10)
    my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x)

  my_heights <- c(
               0.15,
               3.85 - my_offset,
               0.5  + my_offset
             )
  my_margins <- c(1, 1) +
      c(
        margins[1] * 0.08 * max_nchar_colname * my_col_cex,
        margins[2] * 0.04 * max_nchar_rowname * my_row_cex
      )

  if (print_trace_messages) {
    cat_variable(my_heights,          suffix = "; ")
    cat_variable(my_margins,          suffix = "\n\n")
    cat_variable(my_row_cex_scale,    suffix = "; ")
    cat_variable(my_col_cex_scale,    suffix = "\n\n")
    cat_variable(my_row_cex_asterisk, suffix = "\n\n")
    cat_variable(my_row_cex,          suffix = "; ")
    cat_variable(my_col_cex,          suffix = "\n\n")
    cat_variable(my_row_cex,          suffix = "; ")
    cat_variable(my_col_cex,          suffix = "\n\n")
  }

  hm2plus(
    x            = merged_scores,
    Colv         = sample_cluster,
    cellnote     = merged_asterisk,
    cex          = my_hm2_cex,
    cexCol       = my_col_cex,
    cexRow       = my_row_cex,
    denhgt       = 0.15,
    density_info = "none",
    denwid       = 0.5,
    divergent    = TRUE,
    key_par      = list(cex = my_key_cex),
    key_xlab     = "Z-score",
    margins      = my_margins,
    notecex      = my_scale * min(
                     1.5,
                     my_row_cex_asterisk * my_note_warp,
                     my_col_cex * my_note_warp
                   ),
    notecol      = "white",
    scale        = "none",
    srtcol       = 90,
    srtrow       = 0,
    trace        = "none",
    mat          = matrix(
                     c(
                       c(0, 3, 3),
                       c(2, 1, 1),
                       c(0, 4, 0)
                     ),
                     nrow = 3,
                     ncol = 3,
                     byrow = TRUE
                   ),
    widths       = c(0.5, 3.1, 0.9),
    heights      = my_heights,
    ...
  )
  return(TRUE)
}

# function drawing heatmap of contrast fold-change for each kinase,
#   adapted from KSEAapp::KSEA.Heatmap
# nolint start
#   squash un-actionable cyclomatic_complexity warning
ksea_heatmap <- function(
# nolint end
  # the data frame outputs from the KSEA.Scores() function, in list format
  score_list,
  # a character vector of all the sample names for heatmap annotation:
  # - the names must be in the same order as the data in score_list
  # - please avoid long names, as they may get cropped in the final image
  sample_labels,
  # character string of either "p.value" or "FDR" indicating the data column
  #   to use for marking statistically significant scores
  stats,
  # a numeric value between 0 and infinity indicating the min. number of
  #   substrates a kinase must have to be included in the heatmap
  m_cutoff,
  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
  #   for indicating significant kinases in the heatmap
  p_cutoff = {
    cat("argument 'p_cutoff' is required for function 'ksea_heatmap'")
    param_df_exit()
    knitr::knit_exit()
  },
  # a binary input of TRUE or FALSE, indicating whether or not to perform
  #   hierarchical clustering of the sample columns
  sample_cluster,
  # a binary input of TRUE or FALSE, indicating whether or not to export
  #   the heatmap as a .png image into the working directory
  export = FALSE,
  # bottom and right margins; adjust as needed if contrast names are too long
  margins = c(6, 6),
  # print which kinases?
  # - Mandatory argument, must be one of const_ksea_.*_kinases
  which_kinases,
  # additional arguments to gplots::heatmap.2, such as:
  # - main: main title of plot
  # - xlab: x-axis label
  # - ylab: y-axis label
  ...
) {
  filter_m <- function(dataset, m_cutoff) {
    filtered <- dataset[(dataset$m >= m_cutoff), ]
    return(filtered)
  }
  score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
  for (i in seq_len(length(score_list_m))) {
    names <- colnames(score_list_m[[i]])[c(2:7)]
    colnames(score_list_m[[i]])[c(2:7)] <-
      paste(names, i, sep = ".")
  }
  master <-
    Reduce(
      f = function(...) {
        base::merge(..., by = "Kinase.Gene", all = TRUE)
      },
      x = score_list_m
      )

  row.names(master) <- master$Kinase.Gene
  columns <- as.character(colnames(master))
  merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
  colnames(merged_scores) <- sample_labels
  merged_stats <- as.matrix(master[, grep(stats, columns)])
  asterisk <- function(mtrx, p_cutoff) {
    new <- data.frame()
    for (i in seq_len(nrow(mtrx))) {
      for (j in seq_len(ncol(mtrx))) {
        my_value <- mtrx[i, j]
        if (!is.na(my_value) && my_value < p_cutoff) {
          new[i, j] <- "*"
        } else {
          new[i, j] <- ""
        }
      }
    }
    return(new)
  }
  merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))

  asterisk_rows <- rowSums(merged_asterisk == "*") > 0
  all_rows <- rownames(merged_stats)
  names(asterisk_rows) <- all_rows
  non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
  asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])

  row_list <- list()
  row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
  row_list[[const_ksea_all_kinases]] <- all_rows
  row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows

  i <- which_kinases
  my_row_names <- row_list[[i]]
  scrs <- merged_scores[my_row_names, , drop = FALSE]
  stts <- merged_stats[my_row_names, , drop = FALSE]
  merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))

  color_breaks <- create_breaks(scrs)
  if (is.null(color_breaks)) {
    cat("No plot because matrix has too few rows.\n\n")
    return(NULL)
  }
  plot_height <- nrow(scrs) ^ 0.55
  plot_width <- ncol(scrs) ^ 0.7
  if (export == "TRUE") {
    png(
      "KSEA.Merged.Heatmap.png",
      width = plot_width * 300,
      height = 2 * plot_height * 300,
      res = 300,
      pointsize = 14
    )
  }
  did_draw <- draw_kseaapp_summary_heatmap(
    x               = scrs,
    sample_cluster  = sample_cluster,
    merged_asterisk = merged_asterisk,
    color_breaks    = color_breaks,
    margins         = margins
  )
  if (export == "TRUE") {
    dev.off()
  }
  if (!did_draw)
    return(NULL)
  return(my_row_names)
}

# helpers for heatmaps of phosphopeptide intensities

# factory producing function to truncate string after n characters
trunc_n <- function(n) {
    function(x) {
      sapply(
        X = x,
        FUN = function(s) {
          if (is.na(s))
            return("NA")
          cond <- try_catch_w_e(nchar(s) > n)
          if (!is.logical(cond$value)) {
            return(cond$value$message)
          } else if (cond$value) {
            paste0(
              strtrim(s, n),
              "..."
            )
          } else {
            s
          }
        },
        USE.NAMES = FALSE
      )
    }
  }
trunc_long_ppep <- function(x) trunc_n(40)(x)
trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x)
trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x)
trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x)

# factory producing a function that returns a covariance
#   matrix's rows (and columns) having variance > v_min
keep_cov_w_var_gtr_min <- function(v_min) {
  function(x) {
    if (!is.matrix(x))
      return(NULL)
    keepers <- sapply(
        X = seq_len(nrow(x)),
        FUN = function(i) {
          if (x[i, i] < v_min)
            NA
          else
            x[i, i]
        }
      )
    names(keepers) <- rownames(x)
    keepers <- keepers[!is.na(keepers)]
    keepers <- names(keepers)
    if (length(keepers) == 0)
      return(NULL)
    x[keepers, keepers]
  }
}
# function that returns a matrix's rows having variance > 1
keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1)

# factory producing a function that returns
#   - either a matrix's rows (rows = TRUE)
#   - or a matrix's columns (rows = FALSE)
#   having variance > v_min
keep_var_gtr_min <- function(v_min) {
  function(x, rows = TRUE) {
    nrowcol <- if (rows) nrow else ncol
    if (!is.matrix(x))
      return(NULL)
    keepers <- sapply(
        X = seq_len(nrowcol(x)),
        FUN = function(i) {
          row_var <- var(
              if (rows) x[i, ] else  x[, i],
              na.rm = TRUE
            )
          if (is.na(row_var) || row_var <= v_min) NA else i
        }
      )
    keepers <- keepers[!is.na(keepers)]
    if (rows) x[keepers, ] else  x[, keepers]
  }
}

keep_var_gtr_0 <- keep_var_gtr_min(0)

# function drawing heatmap of phosphopeptide intensities
# nolint start
#   squash un-actionable cyclomatic_complexity warning
ppep_heatmap <-
# nolint end
  function(
    m,                              # matrix with rownames already formatted
    cutoff,                         # cutoff used by hm_heading_function
    hm_heading_function,            # construct $ cat heading from m and cutoff
    hm_main_title,                  # main title for plot (drawn below heading)
    suppress_row_dendrogram = TRUE, # set to false to show dendrogram
    max_peptide_count =             # experimental:
            g_intensity_hm_rows,    #   values of 50 and 75 worked well
    master_cex = 1.0,               # basis for text sizes
    margins = NULL,                 # optional margins (bottom, right)
    cellnote = NULL,                # optional matrix of character; dim = dim(m)
    adj = 0.5,                      # adjust text: 0 left, 0.5 middle, 1 right
    row_scaling =                   # should row-scaling be applied if possible?
      g_default_heatmap_row_scaling,
    ...                             # passthru to hm2plus or heatmap.2
  ) {
    peptide_count <- 0
    # emit the heading for the heatmap
    if (hm_heading_function(m, cutoff)) {
      nrow_m <- nrow(m)
      peptide_count <- min(max_peptide_count, nrow_m)
      if (nrow_m > 1) {
        m_margin <- m[peptide_count:1, ]
        # Margin was heuristically derived to accommodate the widest label
        row_mchar_max <- max(nchar(rownames(m_margin)))
        col_mchar_max <- max(nchar(colnames(m_margin)))
        row_margin <- master_cex * row_mchar_max * 2.6
        col_margin <- master_cex * col_mchar_max * 2.6
        if (print_trace_messages) cat(
          sprintf("row_margin = %0.3f; ", row_margin))
        if (print_trace_messages) cat(
          sprintf("col_margin = %0.3f; ", col_margin))
        hm_call <- NULL
        tryCatch(
          {
            # set non-argument parameters for hm_call inner function
            my_row_cex <-
              master_cex * 200000 / (
                (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows
                )
            m_hm <-  m[peptide_count:1, , drop = FALSE]
            if (is.null(cellnote)) {
              cellnote <-  matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm))
# nolint start
#   squash un-actionable "no visible global for ..."
              cellnote[is.na(m_hm)] <- params$heatMapNAcellNote
# nolint end
            } else {
              cellnote <-  cellnote[peptide_count:1, , drop = FALSE]
            }
# nolint start
#   squash un-actionable "no visible global for ..."
            if (params$heatMapNAsubstitute) m_hm[is.na(m_hm)] <- 0
# nolint end
            nrow_m_hm <- nrow(m_hm)
            ncol_m_hm <- ncol(m_hm)
            if (nrow_m_hm < 1 || ncol_m_hm < 1)
              return(peptide_count) # return zero as initialized above


            my_row_cex <- master_cex * (100 / (2 + row_mchar_max))
            my_col_cex <- master_cex * 6 * row_margin / col_margin
            my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex
            my_col_cex <- min(my_col_cex, my_row_cex)
            col_margin <- sqrt(my_col_adj) * col_margin
            if (print_trace_messages) cat(
              sprintf("my_row_cex = %0.3f; ", my_row_cex))
            if (print_trace_messages) cat(
              sprintf("my_col_cex = %0.3f; ", my_col_cex))
            if (is.null(margins)) my_margins <-
              c(
                (my_col_cex + col_margin), # col
                (my_row_cex + row_margin) / my_row_cex  # row
              )
            else
              my_margins <- margins

            if (print_trace_messages) cat(
              sprintf(
                "my_margins = c(%s)\n\n",
                paste(my_margins, collapse = ", ")
              )
            )
            my_hm2_cex <- 2 * master_cex
            my_key_cex <-
              0.9 -
              0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows
            my_key_warp <- 1.5 * 22.75 / row_margin
            my_key_cex <- min(1.10, my_key_warp * my_key_cex)
            my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1)
            my_hgt_scale <- 3.75 # 3.615
            my_hgt_scale <- 3.60 # 3.615
            if (print_trace_messages)
              cat_variable(my_hgt_scale, "\n\n", 3)
            my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows)
            if (print_trace_messages)
              cat_variable(my_warp, "\n\n", 3)
            # added 0.9 heuristically...
            my_plot_height <-
              (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) *
                min(my_hgt_scale, my_hgt_scale * my_warp)
            my_plot_height <-
              min(3.65, my_plot_height * g_intensity_hm_rows / 50)
            my_heights <- c(
                0.3,                                 # title and top dendrogram
                my_plot_height,                      # plot and bottom margin
                4.15 - my_hgt_scale,                 # legend
                0.05 + my_hgt_scale - my_plot_height # whitespace below legend
              )
            my_note_cex <- min(0.8, my_row_cex, my_col_cex)
            if (print_trace_messages) {
              cat_variable(my_plot_height, "\n\n", 3)
              cat_variable(4.19 - my_hgt_scale, "\n\n", 3)
              cat_variable(nrow_m_hm, "; ", 0)
              cat_variable(ncol_m_hm, "; ", 0)
              cat_variable(my_row_cex, "; ", 3)
              cat_variable(my_col_cex, "; ", 3)
              cat_variable(my_note_cex, "; ", 3)
              cat_variable(my_key_cex, "\n\n", 3)
              cat_variable(my_hgt_scale, "; ", 3)
              cat_variable(my_plot_height, "; ", 3)
              cat_variable(my_warp, "\n\n", 3)
              cat_variable(my_heights, "; ", 2)
              cat_variable(sum(my_heights), "\n\n", 3)
            }

            # define hm_call inner function
            hm_call <- function(x, scaling, title) {
              my_cex_main <- min(5.0, 220 / nchar(title))
              op <- par(
                cex.main = my_cex_main * master_cex,
                adj = adj
              )
              if (
                !is.null(
                  hm2plus(
                    x,
                    Colv = NA,
                    Rowv = TRUE,
                    cexRow = my_row_cex,
                    cexCol = my_col_cex,
                    dendrogram = "row",
                    las = 1,
                    main = title,
                    key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
                    cex = my_hm2_cex,
                    key_par = list(cex = my_key_cex),
                    margins = my_margins,
                    widths =  c(0.4, 2.6, 1.5),
                    heights = my_heights,
                    mat = matrix(
                            c(
                              c(0, 3, 3),
                              c(2, 1, 1),
                              c(0, 4, 0),
                              c(0, 0, 0)
                            ),
                            nrow = 4,
                            ncol = 3,
                            byrow = TRUE
                          ),
                    na.rm = TRUE,
                    scale = scaling,
                    srtcol = 90,
                    srtrow = 0,
                    xlab = "",
                    cellnote = cellnote,
                    # ref for na.color:
                    # https://cran.r-project.org/web/packages/gplots/gplots.pdf
# nolint start
#   squash un-actionable "no visible global for ..."
                    na.color = params$heatMapNAcellColor,
# nolint end
                    notecex = my_note_cex,
                    ...
                  )
                )
              ) {
                if (print_trace_messages) cat(
                  sprintf(
                    "my_heights = c(%s); sum = %0.3f\n\n",
                    paste(
                      sprintf("%0.3f", my_heights),
                      collapse = ", "
                    ),
                    sum(my_heights)
                  )
                )
                if (print_trace_messages) cat(
                      sprintf("my_key_cex = %0.3f\n\n",
                      my_key_cex)
                    )
                if (print_trace_messages) cat(
                      sprintf("my_key_cex/my_heights[3] = %0.3f\n\n",
                      my_key_cex / my_heights[3])
                    )
                if (print_trace_messages) cat(
                      sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n",
                      my_heights[2] / my_heights[3])
                    )
              }
              par(op)
            }

            # invoke hm_call inner function
            if (row_scaling != "row" || sum(rowSums(!is.na(m_hm)) < 2))
              hm_call(
                m_hm,
                "none",
                "log(intensities), unimputed, and unnormalized"
                )
            else
              hm_call(
                m_hm,
                "row",
                "log(intensities), row-scaled, unimputed, and unnormalized"
                )
          },
          error = function(e) {
            if (!is.null(hm_call)) {
              m_hm[is.na(m_hm)] <- 0
              tryCatch(
                {
                  if (nrow(m_hm) > 1)
                    hm_call(
                      m_hm,
                      "none",
                      "log(intensities), zero-imputed, unnormalized"
                      )
                  else
                    cat("\nThere are too few peptides to produce a heatmap.\n")
                },
                error = function(r) {
                  cat(
                    sprintf(
                      "\n%s %s Internal message: %s\n\\newline\n\n",
                      "Failure drawing heatmap, possibly because of ",
                      "too many missing values.\n\\newline\n\n",
                      r$message
                      )
                    )
                  cat_margins()
                }
              )
            } else {
              cat(
                "\nFailure drawing heatmap, ",
                "possibly because of too many missing values.\n"
                )
            }
          }
        )
      }
    }
    # return value:
    peptide_count
  }

# function drawing heatmap of correlations if they exist, else covariances
# nolint start
#   squash un-actionable cyclomatic_complexity warning
#   squash un-actionable "no visible global for ..."
cov_heatmap <-
  function(
# nolint end
    m,                              # matrix with rownames already formatted
    kinase_name,
    top_substrates = FALSE,
    ...                             # passthru to hm2plus or heatmap.2
  ) {
      if (print_nb_messages) nbe(
        see_variable(m), " [", nrow(m), "x", ncol(m), "\n")
      colnames_m <- colnames(m)
      is_na_m <- is.na(m)
      tmp <- m
      tmp[is_na_m] <- 0

      tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE

      colnames_tmp <- colnames(tmp)

      my_low_p_seq <- seq(
          from = min(g_intensity_hm_rows, nrow(m)),
          to = 1,
          by = -1
        )

      if (g_correlate_substrates) {
        # zap samples having zero or near-zero variance
        tmp[is.na(tmp)] <- 0
        nzv <- caret::nearZeroVar(
          tmp,           # matrix of values, samples x variables
          freqCut = 1.01, # min(freq most prevalent value /
                         #     freq second most prevalent)
          uniqueCut = 99 # max(number of unique values /
                         #     total number of samples)
        )
        tmp <- if (length(nzv) > 0) {
            m[, -nzv, drop = FALSE]
          } else {
            m
          }
      } else {
        tmp <- m[my_low_p_seq, , drop = FALSE]
      }


      t_m <- t(tmp)
      t_m[is.na(t_m)] <- 0

      my_corcov <- cov(t_m)
      did_filter_rows <- did_filter_cols <- FALSE
      if (g_correlate_substrates && !is_positive_definite(my_corcov)) {
        my_correlate_substrates <- FALSE
        t_m <- t(m[my_low_p_seq, , drop = FALSE])
        t_m[is.na(t_m)] <- 0
        unfiltered_row_count <- ncol(t_m)
        unfiltered_col_count <- nrow(t_m)

        # zap empty samples
        t_m <- t_m[0 < rowSums(x = t_m), ]
        # zap substrates present in fewer than two samples
        foo <- t_m > 0
        foo <- colSums(x = foo) > 1
        t_m <- t_m[, foo]

        did_filter_rows <- unfiltered_row_count > ncol(t_m)
        did_filter_cols <- unfiltered_col_count > nrow(t_m)

        colnames_tmp <- rownames(t_m)
        my_corcov <- cov(t_m)
        if (g_filter_cov_var_gt_1) {
          my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
        }
      } else if (g_correlate_substrates) {
        my_corcov <- cov2cor(my_corcov)
        my_correlate_substrates <- TRUE
      } else {
        my_correlate_substrates <- FALSE
        if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
      }

      omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp]
      suffix <- if (length(omitted_samples) > 1) "s" else ""

      f_omissions <-
        function(is_corr) {
          cat(
            sprintf(
              "Below is the %s plot for %s sites",
              if (is_corr) "correlation" else "covariance",
              sprintf(
                if (top_substrates)
                  "%0.0f \"highest-quality\""
                else
                  "%0.0f",
                ncol(t_m)
              )
            )
          )
          if (did_filter_cols) {
            cat(sprintf(", omitting sample%s ", suffix))
            latex_collapsed_vector(", ", omitted_samples)
          }
          cat(".\n\n")
        }

      if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) {
        cat(
          sprintf(
            "\\newline\n%s %s plot.\n",
            "Insufficient covariance to produce",
            if (my_correlate_substrates)
              "correlation"
            else
              "covariance"
          ),
          "\\newpage\n"
        )
        return(NULL)
      }

      cat("\\leavevmode\n", "\\newpage\n")
      f_omissions(my_correlate_substrates)

      master_cex <- 0.4
      max_nchar <- max(nchar(rownames(t_m)))
      my_limit <- g_intensity_hm_rows
      diminution <- sqrt(my_limit  / (my_limit + ncol(t_m)))
      my_row_cex <-
        my_col_cex <-
          min(1.75, master_cex * 9 * diminution ^ 1.5)
      my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar)
      my_key_cex <- 1.4
      my_hm2_cex <- 1.0 * master_cex
      my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4)
      my_hgt_scale <- 2.7

      my_legend_height <- 4.0 - my_hgt_scale
      my_legend_height <- 0.5 * my_key_cex
      my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit
      my_warp <- 0.8
      my_legend_height <- 0.77
      my_legend_height <- 0.67
      my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height
      my_legend_height <- my_warp * my_legend_height

      parjust <- par(adj = 0.5)
      on.exit(par(parjust))
      my_corcov <- my_corcov[order(rownames(my_corcov)), ]
      my_main <-
        sprintf("%s among %s sites %s",
            if (my_correlate_substrates) "Correlation"
              else "Covariance",
            kinase_name,
            if (!my_correlate_substrates &&
                g_filter_cov_var_gt_1 &&
                did_filter_rows
            )
              "having variance > 1"
            else ""
          )
      my_main_nchar <- nchar(my_main)
      my_heights <- c(
            0.3,
            my_plot_height,
            my_legend_height # was 4.0 - my_hgt_scale # was 4.19
          )
      if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar))
      if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin))
      if (print_trace_messages) cat(
        sprintf("my_plot_height = %0.3f\n\n", my_plot_height))
      if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex))
      if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
      if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
      if (print_trace_messages) cat(
        sprintf("my_key_cex = %0.3f\n\n", my_key_cex))
      if (print_trace_messages) cat(
        sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale))
      if (print_trace_messages) cat(
        sprintf("legend height = %0.3f\n\n", my_legend_height))
      if (print_trace_messages) cat(
        sprintf(
          "my_heights = c(%s); sum = %0.3f\n\n",
          paste(
            sprintf("%0.3f", my_heights),
            collapse = ", "
          ),
          sum(my_heights)
        )
      )
      op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar)
      on.exit(par(op))
      hm2plus(
        x            = my_corcov,
        cex          = my_hm2_cex,
        cexCol       = my_col_cex,
        cexRow       = my_row_cex,
        density_info = "none",
        denhgt       = 0.15,
        denwid       = 0.5,
        divergent    = TRUE,
        key_par      = list(cex = my_key_cex),
        key_xlab     = if (my_correlate_substrates) "Correlation"
                         else "Covariance",
        main         = my_main,
        mat          = matrix(
                         c(
                           c(0, 3, 3),
                           c(2, 1, 1),
                           c(0, 4, 0)
                         ),
                         nrow = 3,
                         ncol = 3,
                         byrow = TRUE
                       ),
        heights = my_heights,
        margins      = c(my_margin, my_margin),
        widths       = c(0.5, 3.1, 0.9),
        scale        = "none",
        symkey       = TRUE,
        symbreaks    = TRUE,
        symm         = FALSE #TODO evaluate TRUE
        # ...
      )
  } # end cov_heatmap

### FILE IMPORT

# function reading bzipped file to data.frame
bzip2df <- function(d, f, ctor = bzfile) {
  # read.delim file (by default, compressed by bzip2)
  if (file.exists(f)) {
    conn <- NULL
    pf <- parent.frame()
    tryCatch(
      assign(
        as.character(substitute(d)),
        read.delim(conn <- bzfile(f, open = "r")),
        pf
      ),
      finally = if (!is.null(conn)) close(conn)
      )
  }
}

```

# Purpose

The purpose of this analysis is to perform for phosphopeptides:

- imputation of missing values,
- quantile normalization,
- ANOVA (using the R stats::`r params$oneWayManyCategories` function),
- assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and
- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
  - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
  - The Human Proteome Database [http://hprd.org](http://hprd.org)
  - NetworKIN [http://networkin.science/](http://networkin.science/)
  - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)

```{r include = FALSE}

if (params$kseaUseAbsoluteLog2FC) {
  sfc <- "|s|"
  pfc <- "|p|"
  pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$"
} else {
  sfc <- "s"
  pfc <- "p"
  pfc_txt <- "${\\log_2 (\\text{fold-change}})$"
}

ksea_heatmap_titles <- list()
ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
  sprintf(
    "Summary for all kinases enriched in one or more contrasts at %s < %s",
    ksea_cutoff_statistic,
    ksea_cutoff_threshold
    )
ksea_heatmap_titles[[const_ksea_all_kinases]] <-
  "Summary figure for all contrasts and all kinases"
ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
  sprintf(
    "Summary for all kinases not enriched at %s < %s in any contrast",
    ksea_cutoff_statistic,
    ksea_cutoff_threshold
    )
# hash to hold names of significantly enriched kinases
ksea_asterisk_hash <- new_env()

# PROCESS (mostly read) PARAMETERS

# False discovery rate adjustment for ANOVA
#  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "")

if (
  ncol(val_fdr) != 1 ||
  sum(!is.numeric(val_fdr[, 1])) ||
  sum(val_fdr[, 1] < 0) ||
  sum(val_fdr[, 1] > 1)
) {
  cat("alphaFile should be one column of numbers within the range [0.0,1.0]")
  param_df_exit()
  knitr::knit_exit()
}
val_fdr <- val_fdr[, 1]

```

```{r echo = FALSE, results = 'asis'}


result <- file.copy(
  from      = preproc_db,
  to        = ksea_app_prep_db,
  overwrite = TRUE
  )
if (!result) {
  write(
    sprintf(
      "fatal error copying initial database '%s' to output '%s'",
      preproc_db,
      ksea_app_prep_db,
    ),
    stderr()
  )
  if (sys.nframe() > 0) {
    cat("Cannot continue and quit() failed. Goodbye.")
    param_df_exit()
    knitr::knit_exit()
    # in case knit_exit doesn't exit
    quit(save = "no", status = 1)
  }
}

if (FALSE) {
  write.table(x = param_df, file = "test-data/params.txt")
}

```

```{r echo = FALSE}
### READ DATA

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
  file = input_file,
  sep = "\t",
  header = TRUE,
  quote = "",
  check.names = FALSE
  )

```


# Extraction of Sample Classes and Names from Input Data

```{r echo = FALSE, results = 'asis'}

data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
my_column_names <- names(full_data)

if (!fdc_is_integer) {
  if (length(data_column_indices) > 0) {
    first_data_column <- data_column_indices[1]
  } else {
    cat(paste("failed to convert firstDataColumn:", first_data_column))
    param_df_exit()
    knitr::knit_exit()
  }
}

cat(
  sprintf(
    paste(
      "\n\nThe input data file has peptide-intensity data",
      "in columns %d (\"%s\") through %d (\"%s\")."
      ),
    tmp <- min(data_column_indices),
    my_column_names[tmp],
    tmp <- max(data_column_indices),
    my_column_names[tmp]
    )
  )

if (TRUE) {
  cat0(
    table_offset(i = 1, new = TRUE),
    "Sample classes and names are shown in ",
    table_href(),
    ".\n\n"
  )
} else {
  cat0(
    "\\newcounter{offset}\n",
    "\\setcounter{offset}{\\value{table}}\n",
    "\\stepcounter{offset}\n",
    "Sample classes and names are shown in ",
    table_href(),
    ".\n\n"
  )
}

#TODO remove this unused variable and assignment
if (FALSE) {
  # Write column names as a LaTeX enumerated list.
  column_name_df <- data.frame(
    column = seq_len(length(colnames(full_data))),
    name = paste0("\\verb@", colnames(full_data), "@")
    )
}

```

```{r echo = FALSE, results = 'asis'}
# extract intensity columns from full_data to quant_data
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA
rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
full_data_names <- colnames(quant_data)
# Extract factors and trt-replicates using regular expressions.
# Typically:
#   regex_sample_names    is "\\.\\d+[A-Z]$"
#   regex_sample_grouping is "\\d+"
# This would distinguish trt-replicates by terminal letter [A-Z]
#   in sample names and group them into trts by the preceding digits.
#   e.g.:
#     group .1A .1B .1C into group 1;
#     group .2A .2B .2C, into group 2;
#     etc.
m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
sample_name_matches <- regmatches(names(quant_data), m)
colnames(quant_data) <- sample_name_matches

write_debug_file(quant_data)

rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))

if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n"))
if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n"))
if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n"))

sample_treatment_df <-
  save_sample_treatment_df <-
    data.frame(
      class = smpl_trt,
      sample = sample_name_matches,
      full_sample_names = full_data_names
      )

if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n"))

# reorder data
my_order <- with(sample_treatment_df, order(class, sample))
quant_data <- quant_data[, my_order]
sample_name_matches <- sample_name_matches[my_order]
smpl_trt <- smpl_trt[my_order]
sample_treatment_df <- data.frame(
  class = smpl_trt,
  sample = sample_name_matches
  )

# filter smpl_trt as appropriate
if (sample_group_filter %in% c("include", "exclude")) {
  include_sample <-
    mgrepl(
      v = sample_group_filter_patterns,
      s = as.character(smpl_trt),
      fixed = sample_group_filter_fixed,
      perl = sample_group_filter_perl,
      ignore.case = sample_group_filter_nocase
    )
  if (sum(include_sample) < 2) {
    errmsg <-
      paste(
        "ERROR:",
        sum(include_sample),
        "samples are too few for analysis;",
        "check input parameters for sample-name parsing"
        )
    cat0(
      errmsg,
      "\\stepcounter{offset}\n",
      " in ",
      table_href(),
      ".\n\n"
    )
    data_frame_tabbing_latex(
      x = save_sample_treatment_df,
      tabstops = c(1.25, 1.25),
      caption = "Sample classes",
      use_subsubsection_header = FALSE
      )
    data_frame_tabbing_latex(
      x =
        param_df[
          c("regexSampleNames",
            "regexSampleGrouping",
            "groupFilterPatterns",
            "groupFilter",
            "groupFilterMode"
          ),
        ],
      tabstops = c(1.75),
      underscore_whack = TRUE,
      caption = "Input parameters for sample-name parsing",
      verbatim = FALSE
      )
    param_df_exit()
    knitr::knit_exit()
    return(invisible(-1))
  }
  sample_treatment_df <-
    if (sample_group_filter == "include")
      sample_treatment_df[include_sample, ]
    else
      sample_treatment_df[!include_sample, ]
} else {
  include_sample <- rep.int(TRUE, length(smpl_trt))
}
sample_name_matches <- sample_treatment_df$sample
rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
sample_treatment_df$class <- smpl_trt

# filter quant_data as appropriate
number_of_samples <- length(sample_name_matches)
quant_data <- quant_data[, sample_name_matches]

sample_level_integers <- as.integer(smpl_trt)
sample_treatment_levels <- levels(smpl_trt)
count_of_treatment_levels <- length(sample_treatment_levels)

# for each phosphopeptide, across treatment levels, compute minimum
#   count of observed (i.e., non-missing) values
my_env <- new_env()
for (l in sample_treatment_levels)
  my_env[[as.character(l)]] <-
    as.vector(rowSums(!is.na(quant_data[l == smpl_trt])))
min_group_obs_count <- row_apply(
    x = Reduce(
      f = function(l, r) cbind(l, my_env[[r]]),
      x = sample_treatment_levels,
      init = c()
    ),
    fun = min
  )
names(min_group_obs_count) <- rownames(quant_data)
rm(my_env)

# display (possibly-filtered) results
cat("\\newpage\n")

if (sum(include_sample) > 1) {
data_frame_tabbing_latex(
  x = sample_treatment_df,
  tabstops = c(1.25),
  caption = "Sample classes",
  use_subsubsection_header = FALSE
  )
}
sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6))
sample_colsep <- transition_positions(as.integer(sample_treatment_df$class))
```

```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

## Are the log-transformed sample distributions similar?

```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}

quant_data[quant_data == 0] <- NA  #replace 0 with NA
quant_data_log <- log10(quant_data)

rownames(quant_data_log) <- rownames(quant_data)
colnames(quant_data_log) <- sample_name_matches

write_debug_file(quant_data_log)

g_ppep_distrib_ctl <- new_env()
g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow)
g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow))
g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow))

my_ppep_distrib_bxp <- function(
    x
  , sample_name_grow = sample_name_grow
  , main
  , varwidth = FALSE
  , sub = NULL
  , xlab
  , ylab
  , col = const_boxplot_fill
  , notch = FALSE
  , ppep_distrib_ctl = g_ppep_distrib_ctl
  , ...
  ) {
    my_xlab_line  <- g_ppep_distrib_ctl$xlab_line
    my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom
    my_axis       <- g_ppep_distrib_ctl$axis

    if (print_trace_messages) {
      cat_variable(my_xlab_line, suffix = "; ")
      cat_variable(my_mai_bottom, suffix = "; ")
      cat_variable(my_axis, suffix = "\n\n")
    }

    old_par <- par(
      mai = par("mai") + c(my_mai_bottom, 0, 0, 0),
      cex.axis = my_axis,
      cex.lab = 1.2
    )
    tryCatch(
      {
        # Vertical plot
        boxplot(
          x
        , las = 2
        , col = col
        , main = main
        , sub = NULL
        , ylab = ylab
        , xlab = NULL
        , notch = notch
        , varwidth = varwidth
        , ...
        )
        title(
          sub = sub
        , cex.sub = 1.0
        , line = my_xlab_line + 1
        )
        title(
          xlab = xlab
        , line = my_xlab_line
        )
      },
      finally = par(old_par)
    )
  }

my_ppep_distrib_bxp(
    x = quant_data_log
  , sample_name_grow = sample_name_grow
  , main = "Peptide intensities for each sample"
  , varwidth = boxplot_varwidth
  , sub = "Box widths reflect number of peptides for sample"
  , xlab = "Sample"
  , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
  , col = const_boxplot_fill
  , notch = FALSE
  )

cat("\n\n\n\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
if (nrow(quant_data_log) > 1) {
  quant_data_log_stack <- stack(quant_data_log)
  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
    ggplot2::ylab("Probability density") +
    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
} else {
  cat("No density plot because there are too few peptides.\n\n")
}
```

## Globally, are peptide intensities are approximately unimodal?

<!--
# bquote could be used as an alternative to latex2exp::TeX below particularly
#   and when plotting math expressions generally, at the expense of mastering
#   another syntax, which hardly seems worthwhile when I need to use TeX
#   elsewhere; here's an introduction to bquote:
#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
-->
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}

# identify the location of missing values
fin <- is.finite(as.numeric(as.matrix(quant_data_log)))

logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
logvalues_density <- density(logvalues)
plot(
  x = logvalues_density,
  main = latex2exp::TeX(
    "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
    ),
  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
  ylab = "Probability density"
  )
hist(
  x = as.numeric(as.matrix(quant_data_log)),
  xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
  breaks = 100,
  main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
  xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
)
```

# Characterization of Input Data

## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
# determine quantile
q1 <- quantile(logvalues, probs = mean_percentile)[1]

# 1 = row of matrix (ie, phosphopeptide)
sds <- row_apply(quant_data_log, sd_finite)
if (sum(!is.na(sds)) > 2) {
  plot(
    density(sds, na.rm = TRUE)
  , main = "Smoothed estimated probability density vs. std. deviation"
  , sub = "(probability estimation made with Gaussian smoothing)"
  , ylab = "Probability density"
  )
} else {
  cat(
    "At least two non-missing values are required to plot",
    "probability density.\n\n"
    )
}

```

```{r echo = FALSE}
# Determine number of cells to impute
temp <- quant_data[is.na(quant_data)]

# Determine number of values to impute
number_to_impute <- length(temp)

# Determine percent of missing values
pct_missing_values <-
  round(length(temp) / (length(logvalues) + length(temp)) * 100)
```

```{r echo = FALSE}

# prep for trt-median based imputation

```
# Imputation of Missing Values

```{r echo = FALSE}

imp_smry_pot_peptides_before <- nrow(quant_data_log)
imp_smry_missing_values_before   <- number_to_impute
imp_smry_pct_missing      <- pct_missing_values

```

```{r echo = FALSE}
#Determine number of cells to impute

```
```{r echo = FALSE}

# Identify which values are missing and need to be imputed
ind <- which(is.na(quant_data), arr.ind = TRUE)

```
```{r echo = FALSE, results = 'asis'}

# Apply imputation
switch(
  imputation_method
, "group-median" = {
    quant_data_imp <- quant_data
    imputation_method_description <-
      paste("Substitute missing value with",
        "median peptide-intensity for sample group.\n"
        )
    # Take the accurate ln(x+1) because the data are log-normally distributed
    #   and because median can involve an average of two measurements.
    quant_data_imp <- log1p(quant_data_imp)
    for (i in seq_len(count_of_treatment_levels)) {
      # Determine the columns for this factor-level
      level_cols <- i == sample_level_integers
      # Extract those columns
      lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
      # assign to ind the row-column pairs corresponding to each NA
      ind <- which(is.na(lvlsbst), arr.ind = TRUE)
      # No group-median exists if there is only one sample
      #   a given ppep has no measurement; otherwise, proceed.
      if (ncol(lvlsbst) > 1) {
        the_centers <-
          row_apply(lvlsbst, median, na.rm = TRUE)
        for (j in seq_len(nrow(lvlsbst))) {
          for (k in seq_len(ncol(lvlsbst))) {
            if (is.na(lvlsbst[j, k])) {
              lvlsbst[j, k] <- the_centers[j]
            }
          }
        }
        quant_data_imp[, level_cols] <- lvlsbst
      }
    }
    # Take the accurate e^x - 1 to match scaling of original input.
    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
    good_rows <- !is.na(rowMeans(quant_data_imp))
  }
, "median" = {
    quant_data_imp <- quant_data
    imputation_method_description <-
      paste("Substitute missing value with",
        "median peptide-intensity across all sample classes.\n"
        )
    # Take the accurate ln(x+1) because the data are log-normally distributed
    #   and because median can involve an average of two measurements.
    quant_data_imp <- log1p(quant_data_imp)
    quant_data_imp[ind] <-
      row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]]
    # Take the accurate e^x - 1 to match scaling of original input.
    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
    good_rows <- !is.nan(rowMeans(quant_data_imp))
  }
, "mean" = {
    quant_data_imp <- quant_data
    imputation_method_description <-
      paste("Substitute missing value with",
        "geometric-mean peptide intensity across all sample classes.\n"
        )
    # Take the accurate ln(x+1) because the data are log-normally distributed,
    #   so arguments to mean should be previously transformed.
    #   this will have to be
    quant_data_imp <- log1p(quant_data_imp)
    # Assign to NA cells the mean for the row
    quant_data_imp[ind] <-
      row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]]
    # Take the accurate e^x - 1 to match scaling of original input.
    quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
    good_rows <- !is.nan(rowMeans(quant_data_imp))
  }
, "random" = {
    quant_data_imp <- quant_data
    # sd to be used is the median sd
    m1 <- median(sds, na.rm = TRUE) * sd_percentile
    # If you want results to be reproducible, you will want to call
    #   base::set.seed before calling stats::rnorm
    imputation_method_description <-
      paste("Substitute each missing value with random intensity",
        sprintf(
          "random intensity $N \\sim (%0.2f, %0.2f)$.\n",
          q1, m1
          )
        )
    cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
      100 * mean_percentile))
    cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
      sd_percentile))
    quant_data_imp[ind] <-
      10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
    quant_data_imp_ln <- log1p(quant_data_imp)
    good_rows <- !is.nan(rowMeans(quant_data_imp))
  }
)
quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e

if (length(good_rows) < 1) {
  print("ERROR: Cannot impute data; there are no good rows!")
  return(-1)
  }
```

```{r echo = FALSE, results = 'asis'}
cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
```

```{r echo = FALSE}

imp_smry_pot_peptides_after <- sum(good_rows)
imp_smry_rejected_after  <- sum(!good_rows)
imp_smry_missing_values_after   <- sum(is.na(quant_data_imp[good_rows, ]))

# From ?`%in%`:
#   %in% is currently defined
#   as function(x, table) match(x, table, nomatch = 0) > 0

stock_in <-
  names(good_rows) %in%
    names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
if (print_nb_messages) nbe(see_variable(stock_in), "\n")

explicit_in <-
  0 < match(
    names(good_rows),
    names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
    )
if (print_nb_messages) nbe(see_variable(explicit_in), "\n")

great_enough_row_names <- good_rows[
  names(good_rows) %in%
    names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
]
if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
great_enough_row_names <- great_enough_row_names[great_enough_row_names]
if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
```

```{r echo = FALSE, results = 'asis'}
# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
tabular_lines_fmt <- paste(
  "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
  " \\leavevmode",
  " \\caption{Imputation Results}",
  " \\centering", # \centering centers the table on the page
  " \\begin{tabular}{l c c c}",
  "  \\hline\\hline",
  "  \\  & potential peptides   & missing values & rejected",
  "    peptides \\\\ [0.5ex]",
  "  \\hline",
  "  before imputation     & %d & %d (%d\\%s) &    \\\\",
  "  after imputation      & %d & %d          & %d \\\\",
  "  after keep comparable & %d &             & %d \\\\ [1ex]",
  "  \\hline",
  " \\end{tabular}",
  #" \\label{table:nonlin}", # may be used to refer this table in the text
  "\\end{table}",
  sep = "\n"
  )
tabular_lines <-
  sprintf(
    tabular_lines_fmt,
    imp_smry_pot_peptides_before,
    imp_smry_missing_values_before,
    imp_smry_pct_missing,
    "%",
    imp_smry_pot_peptides_after,
    imp_smry_missing_values_after,
    imp_smry_rejected_after,
    length(great_enough_row_names),
    imp_smry_pot_peptides_before -
      length(great_enough_row_names)
    )
cat(tabular_lines)
```

```{r filter_good_rows, echo = FALSE}

if (print_nb_messages) nbe(
  "before name extraction, ", see_variable(length(good_rows)), " ",
  see_variable(good_rows), "\n")
good_rows <- names(good_rows[names(great_enough_row_names)])
if (print_nb_messages) nbe(
  "after name extraction, ", see_variable(length(good_rows)),
  see_variable(good_rows), "\n")

min_group_obs_count <- min_group_obs_count[good_rows]

# Zap rows where imputation was insufficiently effective
full_data         <- full_data        [good_rows, ]
quant_data        <- quant_data       [good_rows, ]
quant_data_log    <- quant_data_log   [good_rows, ]

if (print_nb_messages) nbe(
  "before row filter, ", see_variable(nrow(quant_data_imp)), "\n")
quant_data_imp <- quant_data_imp[good_rows, ]
if (print_nb_messages) nbe(
  "after row filter, ", see_variable(nrow(quant_data_imp)), "\n")
write_debug_file(quant_data_imp)
quant_data_imp_good_rows <- quant_data_imp

write_debug_file(quant_data_imp_good_rows)
```

```{r echo = FALSE, results = 'asis'}

can_plot_before_after_imp <- TRUE
d_combined <-
  as.numeric(
    as.matrix(
      log10(quant_data_imp)
      )
    )
d_original <-
  as.numeric(
    as.matrix(
      log10(quant_data_imp[!is.na(quant_data)])
      )
    )

if (sum(!is.na(d_original)) > 2) {
  d_original <- density(d_original)
} else {
  can_plot_before_after_imp <- FALSE
}
if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) {
  d_combined <- density(d_combined)
} else {
  can_plot_before_after_imp <- FALSE
}

if (sum(is.na(quant_data)) > 0) {
  # There ARE missing values
  d_imputed <-
    as.numeric(
      as.matrix(
        log10(quant_data_imp[is.na(quant_data)])
        )
      )
  if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) {
    d_imputed <- (density(d_imputed))
  } else {
    can_plot_before_after_imp <- FALSE
  }
} else {
  # There are NO missing values
  d_imputed <- d_combined
}

```

```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
zero_sd_rownames <-
  rownames(quant_data_imp)[
    is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0)
  ]

if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
  cat("All peptides have zero standard deviation.  Cannot continue.")
  param_df_exit()
  knitr::knit_exit()
}
if (length(zero_sd_rownames) > 0) {
  cat(
    sprintf(
      "%d %s %s",
      length(zero_sd_rownames),
      "peptides with zero variance",
      "were removed from statistical consideration"
    )
  )
  zap_named_rows <- function(df, nms) {
    return(df[!(row.names(df) %in% nms), ])
  }
  quant_data_imp <-
    zap_named_rows(quant_data_imp, zero_sd_rownames)
  quant_data <-
    zap_named_rows(quant_data, zero_sd_rownames)
  full_data <-
    zap_named_rows(full_data, zero_sd_rownames)
  min_group_obs_count <-
    min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames]
}

if (sum(is.na(quant_data)) > 0) {
  cat("\\leavevmode\\newpage\n")
  # Copy quant data to x
  x <- quant_data
  # x gets to have values of:
  #  - NA for observed values
  #  - 1 for missing values
  x[is.na(x)] <- 0
  x[x > 1] <- NA
  x[x == 0] <- 1
  # Log-transform imputed data
  # update variable because rows may have been eliminated from quant_data_imp
  quant_data_imp_log10 <- log10(quant_data_imp)

  write_debug_file(quant_data_imp_log10)

  # Set blue_dots to log of quant data or NA for NA quant data
  blue_dots <- log10(quant_data)
  # Set red_dots to log of imputed data or NA for observed quant data
  red_dots <- quant_data_imp_log10 * x

  count_red <- sum(!is.na(red_dots))
  count_blue <- sum(!is.na(blue_dots))
  ylim_save <- ylim <- c(
    min(red_dots, blue_dots, na.rm = TRUE),
    max(red_dots, blue_dots, na.rm = TRUE)
    )
  show_stripchart <-
    50 > (count_red + count_blue) / length(sample_name_matches)
  if (show_stripchart) {
    boxplot_sub <- "Light blue = data before imputation; Red = imputed data;"
  } else {
    boxplot_sub <- ""
  }

  # Vertical plot
  colnames(blue_dots) <- sample_name_matches
  my_ppep_distrib_bxp(
      x = blue_dots
    , sample_name_grow = sample_name_grow
    , main = "Peptide intensities after eliminating unusable peptides"
    , varwidth = boxplot_varwidth
    , sub = paste(boxplot_sub,
                  "Box widths reflect number of peptides for sample")
    , xlab = "Sample"
    , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
    , col = const_boxplot_fill
    , notch = FALSE
    , ylim = ylim
    )

  if (show_stripchart) {
    # Points
    # ref: https://r-charts.com/distribution/add-points-boxplot/
    # NA values are not plotted
    stripchart(
      blue_dots,                 # Data
      method = "jitter",          # Random noise
      jitter = const_stripchart_jitter,
      pch = 19,                   # Pch symbols
      cex = const_stripchart_cex, # Size of symbols reduced
      col = "lightblue",          # Color of the symbol
      vertical = TRUE,            # Vertical mode
      add = TRUE                  # Add it over
      )
    stripchart(
      red_dots,                   # Data
      method = "jitter",          # Random noise
      jitter = const_stripchart_jitter,
      pch = 19,                   # Pch symbols
      cex = const_stripchart_cex, # Size of symbols reduced
      col = "red",                # Color of the symbol
      vertical = TRUE,            # Vertical mode
      add = TRUE                  # Add it over
      )

  }
}
```

```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
if (sum(is.na(quant_data)) > 0) {
  # show measured values in blue on left half-violin plot
  cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
  old_par <- par(
    mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0),
    cex.axis = g_ppep_distrib_ctl$axis,
    cex.lab = 1.2
  )
  tryCatch(
    {
      vioplot::vioplot(
        x = lapply(blue_dots, function(x) x[!is.na(x)]),
        col = "lightblue1",
        side = "left",
        plotCentre = "line",
        ylim = ylim_save,
        main = "Distributions of observed and imputed data",
        sub = NULL,
        las = 2,
        xlab = NULL,
        ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
        )
      title(
        sub = "Light blue = observed data; Pink = imputed data",
        cex.sub = 1.0,
        line = g_ppep_distrib_ctl$xlab_line + 1
      )
      title(
        xlab = "Sample",
        line = g_ppep_distrib_ctl$xlab_line
      )
      red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
      cols_to_delete <- c()
      for (ix in seq_len(length(red_violins))) {
        if (length(red_violins[[ix]]) < 1) {
          cols_to_delete <- c(cols_to_delete, ix)
        }
      }
      # destroy any unimputable columns
      if (!is.null(cols_to_delete)) {
        red_violins <- red_violins[-cols_to_delete]
      }
      # plot imputed values in red on right half-violin plot
      vioplot::vioplot(
        x = red_violins,
        col = "lightpink1",
        side = "right",
        plotCentre = "line",
        add = TRUE
        )

    },
    finally = par(old_par)
  )

  # density plot
  cat("\\leavevmode\n\n\n\n\n\n\n")
  if (can_plot_before_after_imp) {
    ylim <- c(
      0,
      max(
        if (is.list(d_combined)) d_combined$y else d_combined,
        if (is.list(d_original)) d_original$y else d_original,
        if (is.list(d_imputed)) d_imputed$y else d_imputed,
        na.rm = TRUE
      )
    )
    plot(
      d_combined,
      ylim = ylim,
      sub =
        paste(
          "Blue = data before imputation; Red = imputed data;",
          "Black = combined"
          ),
      main = "Density of peptide intensity before and after imputation",
      xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
      ylab = "Probability density"
    )
    lines(d_original, col = "blue")
    lines(d_imputed, col = "red")
  } else {
    cat(
      "There are too few points to plot the density of peptide intensity",
      "before and after imputation."
      )
  }
  cat("\\leavevmode\\newpage\n")
}
```

# Quantile Normalization

The excellent `normalize.quantiles` function from
*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
performs "quantile normalization" as described Bolstad *et al.* (2003),
DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html),
i.e., it assumes that the goal is to detect
subtle differences among grossly similar samples (having similar distributions)
by equalizing intra-quantile quantitations^[Unfortunately,
one software library upon which `preprocessCore` depends
*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be
installed.  The installation command equivalent to what was used to install the library to produce the results presented here is:
\linebreak
`    conda install bioconductor-preprocesscore openblas=0.3.3`].


<!--
  # Apply quantile normalization using preprocessCore::normalize.quantiles
  # ---
  # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
  #   except this: https://support.bioconductor.org/p/122925/#9135989
  #   says to install it like this:
  #     ```
  #     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1])
  #     ```
  # conda installation (necessary because of a bug in recent openblas):
  #   conda install bioconductor-preprocesscore openblas=0.3.3
  # ...
  # ---
  # normalize.quantiles {preprocessCore} --  Quantile Normalization
  #
  # Description:
  #   Using a normalization based upon quantiles, this function normalizes a
  #     matrix of probe level intensities.
  #
  #   THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
  #     assumption that the data is missing at random.
  #
  # Usage:
  #   normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
  #
  # Arguments:
  #
  #   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
  #
  #   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
  #       but in certain situations not making a copy of the matrix, but instead normalizing
  #       it in place will be more memory friendly.
  #
  #   - keep.names: Boolean option to preserve matrix row and column names in output.
  #
  # Details:
  #   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
  #     No special allowances are made for outliers. If you make use of quantile normalization
  #     please cite Bolstad et al, Bioinformatics (2003).
  #
  #   This functions will handle missing data (ie NA values), based on
  #     the assumption that the data is missing at random.
  #
  #   Note that the current implementation optimizes for better memory usage
  #     at the cost of some additional run-time.
  #
  # Value: A normalized matrix.
  #
  # Author: Ben Bolstad, bmbolstad.com
  #
  # References
  #
  #   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
  #       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
  #
  #   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
  #       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
  #       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
  #       http://bmbolstad.com/misc/normalize/normalize.html
  # ...
-->
```{r echo = FALSE, results = 'asis'}

if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n")
if (nrow(quant_data_imp) > 0) {
  quant_data_imp_qn <- preprocessCore::normalize.quantiles(
     as.matrix(quant_data_imp), keep.names = TRUE
   )
} else {
  quant_data_imp_qn <- as.matrix(quant_data_imp)
}

if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n")

quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
write_debug_file(quant_data_imp_qn)

quant_data_imp_qn_log <- log10(quant_data_imp_qn)
write_debug_file(quant_data_imp_qn_log)

if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")

quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))

sel <- row_apply(quant_data_imp_qn_ls, any_nan)
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls

quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)

quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)

write_debug_file(quant_data_imp_qn_ls)
write_debug_file(quant_data_imp_qn_ls2)

# Create data.frame used by ANOVA analysis
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
```

## Are normalized, imputed, log-transformed sample distributions similar?

```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}

# Save unimputed quant_data_log for plotting below
unimputed_quant_data_log <- quant_data_log

# log10 transform (after preparing for zero values,
#   which should never happen...)
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
quant_data_log <- log10(quant_data_imp_qn)

how_many_peptides <- nrow(quant_data_log)

if ((how_many_peptides) > 0) {
  cat(
    sprintf(
      "Intensities for %d peptides:\n\n\n",
      how_many_peptides
      )
    )
  cat("\n\n\n")


  # data visualization
  if (TRUE) {

  my_ppep_distrib_bxp(
      x = quant_data_log
    , sample_name_grow = sample_name_grow
    , main = "Peptide intensities for each sample"
    , varwidth = boxplot_varwidth
    , sub = NULL
    , xlab = "Sample"
    , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
    , col = const_boxplot_fill
    , notch = boxplot_notch
    )

  } else {

  old_par <- par(
    mai = par("mai") + c(0.5, 0, 0, 0)
  , oma = par("oma") + c(0.5, 0, 0, 0)
  )
  # ref: https://r-charts.com/distribution/add-points-boxplot/
  # Vertical plot
  colnames(quant_data_log) <- sample_name_matches
  boxplot(
    quant_data_log
  , las = 2
  , cex.axis = 0.9 * sample_name_grow
  , col = const_boxplot_fill
  , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
  , xlab = "Sample"
  , notch = boxplot_notch
  , varwidth = boxplot_varwidth
  )
  par(old_par)
  }
} else {
  cat("There are no peptides to plot\n")
}

cat("\n\n\n")

```

```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
if (nrow(quant_data_log) > 1) {
  quant_data_log_stack <- stack(quant_data_log)
  ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
    ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
    ggplot2::ylab("Probability density") +
    ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
} else {
  cat("No density plot because there are fewer than two peptides to plot.\n\n")
}
```
```{r echo = FALSE, results = 'asis'}
cat("\\leavevmode\\newpage\n")
```

# ANOVA Analysis

## Assignment of $p$-value and quality score

For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups.

However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance.  For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x).

Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as:

$$
\textit{quality}_j
  = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005}
$$

where:

- $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and
- $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$.


```{r, echo = FALSE, results = 'asis'}
cat("\\newpage\n")

# Make new data frame containing only Phosphopeptides
#   to connect preANOVA to ANOVA (connect_df)
connect_df <- data.frame(
    data_table_imp_qn_lt$Phosphopeptide
  , data_table_imp_qn_lt[, first_data_column]
  )
colnames(connect_df) <- c("Phosphopeptide", "Intensity")
```

```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'}
g_can_run_ksea <- FALSE
old_oma <- par("oma")

# nolint start
#   squash un-actionable cyclomatic_complexity warning
if (count_of_treatment_levels < 2) {
# nolint end
  cat(
    "ERROR!!!! Cannot perform ANOVA analysis",
    "(see next page)\\newpage\n"
  )
  cat(
    "ERROR: ANOVA analysis",
    "requires two or more factor levels!\n\n\n"
  )

  cat("\n\n\n\n\n")
  cat("Unparsed sample names are:\n\n\n",
    "\\begin{quote}\n",
    paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

  regex_sample_names <- latex_printable_control_seqs(regex_sample_names)

  cat("\\leavevmode\n\n\n")
  cat("Parsing rule for SampleNames is",
    "\n\n\n",
    "\\text{'",
    regex_sample_names,
    "'}\n\n\n",
    sep = ""
    )

  cat("\nParsed sample names are:\n",
    "\\begin{quote}\n",
    paste(sample_name_matches, collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

  regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping)

  cat("\\leavevmode\n\n\n")
  cat("Parsing rule for SampleGrouping is",
    "\n\n\n",
    "\\text{'",
    regex_sample_grouping,
    "'}\n\n\n",
    sep = ""
    )

  cat("\n\n\n")
  cat("Sample group assignments are:\n",
    "\\begin{quote}\n",
    paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
    "\n\\end{quote}\n\n")

} else {

  if (print_nb_messages) nbe("computing p_value_data_anova_ps\n")
  if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
  if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
  if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n")
  if (print_nb_messages) nbe(see_variable(anova_func), "\n")
  if (print_nb_messages) nbe(see_variable(smpl_trt), "\n")
  if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n")
  tryCatch(
    {
      p_value_data_anova_ps <-
        row_apply(
          quant_data_imp_qn_log,
          anova_func,
          grouping_factor = smpl_trt,
          one_way_f = one_way_all_categories
          )
    },
    error = function(e) {
      mesg <- paste("Could not compute ANOVA because", e$message)
      cat("\n\n", mesg, "\n\n")
      param_df_noexit(e)
      sink(stderr())
      cat("\n\n", mesg, "\n\n")
      values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n")
      cat(values)
      sink()
      knitr::knit_exit()
      exit(code = 1)
    }
  )
  if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n")

  p_value_data_anova_ps_fdr <-
    p.adjust(p_value_data_anova_ps, method = "fdr")
  my_ppep_v <- full_data[, 1]
  p_value_data <- list(
    phosphopeptide = my_ppep_v,
    raw_anova_p = p_value_data_anova_ps,
    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
    missing_values = rowSums(is.na(quant_data)),
    min_group_obs_count = min_group_obs_count
  )
  p_value_data <- data.frame(
    phosphopeptide = my_ppep_v,
    raw_anova_p = p_value_data_anova_ps,
    fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
    missing_values = rowSums(is.na(quant_data)),
    min_group_obs_count = min_group_obs_count
  )
  p_value_data$quality <- 1.0 / with(
      p_value_data,
      fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count)
    )

  p_value_data$ranking <-
    with(
      p_value_data,
      switch(
        g_intensity_hm_criteria,
        "quality"  = order(-quality),
        "na_count" = order(missing_values, fdr_adjusted_anova_p),
        # the default is "p_value"
        order(fdr_adjusted_anova_p)
      )
    )
  p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE]

  write.table(
    p_value_data,
    file = "p_value_data.txt",
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )


  # output ANOVA file to constructed filename,
  #   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
  #   becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"

  # Re-output datasets to include p-values
  metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)])
  write.table(
    cbind(metadata_plus_p, quant_data_imp),
    file = imputed_data_filename,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )

  write.table(
    cbind(metadata_plus_p, quant_data_imp_qn_log),
    file = imp_qn_lt_data_filenm,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
    )


  first_page_suppress <- 1
  number_of_peptides_found <- 0
  cutoff <- val_fdr[1]
  for (cutoff in val_fdr) {
    #loop through FDR cutoffs
    if (number_of_peptides_found > g_intensity_hm_rows - 1) {
      cat("\\leavevmode\n\n\n")
      break
      }

    if (print_trace_messages) {
      cat_variable(cutoff)
      cat("\n\n")
    }

    bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff)
    bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class)
    g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0)
    bool_4 <- (p_value_data$quality >= params$minQuality)
    bool_3 <- as.logical(
        as.integer(bool_1) *
        as.integer(bool_2) *
        as.integer(bool_4)
      )
    if (print_trace_messages) {
      if (length(bool_1) > 30) {
        cat_variable(bool_1, force_str = TRUE)
        cat_variable(bool_2, force_str = TRUE)
        cat_variable(bool_3, force_str = TRUE)
      } else {
        cat_variable(bool_1, suffix = "\n\n")
        cat_variable(bool_2, suffix = "\n\n")
        cat_variable(bool_3, suffix = "\n\n")
      }
      cat_variable(length(bool_3), digits = 0, suffix = "; ")
      cat_variable(sum(bool_3), digits = 0, suffix = "\n\n")
    }

    filtered_p <-
      p_value_data[bool_3, , drop = FALSE]
    filtered_p <-
      filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE]

    if (print_trace_messages)
      cat_variable(filtered_p, force_str = TRUE)

    have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0

    filter_result_string <-
      sprintf(
        "%s, %s of %0.0f peptides remained having both %s and %s.\n\n",
        "After filtering for ANOVA results",
        if (have_remaining_peptides)
          as.character(sum(bool_3, na.rm = TRUE))
        else
          "none",
        length(bool_3),
        adj_p_string <-
          sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))),
        sprintf(
          "more than %0.0f observations in some groups",
          max(0, g_intensity_min_per_class - 1)
        )
      )

    filtered_data_filtered <-
      quant_data_imp_qn_log[
        rownames(filtered_p),
        , drop = FALSE
        ]
    # order by p-value
    filtered_data_filtered <-
      filtered_data_filtered[
        order(filtered_p$fdr_adjusted_anova_p),
        , drop = FALSE
        ]

    if (have_remaining_peptides
        && nrow(filtered_p) > 0
        && nrow(filtered_data_filtered) > 0
        ) {
      if (first_page_suppress == 1) {
        first_page_suppress <- 0
        } else {
          cat("\\newpage\n")
        }
      latex_samepage({
        cat(filter_result_string)
        if (nrow(filtered_data_filtered) > 1) {
          cat(subsection_header(sprintf(
            "Intensity distributions for %d phosphopeptides, %s\n",
            nrow(filtered_data_filtered),
            adj_p_string
          )))
        } else {
          cat(
            subsection_header(
              sprintf(
                "Intensity distribution for one phosphopeptide, %s\n",
                adj_p_string
              )
            ),
            rownames(filtered_data_filtered)[1],
            "\n"
          )
        }
      }) # end latex_samepage

      old_par <- par(
        mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
        oma = old_oma * c(1, 1, 0.3, 1),
        cex.main = 0.9,
        cex.axis = 0.7,
        fin = c(9, 7.25)
        )
      # Vertical plot
      colnames(filtered_data_filtered) <- sample_name_matches
      tryCatch(
        boxplot(
          filtered_data_filtered,
          main = "Imputed, normalized intensities", # no line plot
          las = 2,
          cex.axis = 0.9 * sample_name_grow,
          col = const_boxplot_fill,
          ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
          notch = FALSE,
          varwidth = boxplot_varwidth
        ),
        error = function(e) {
          print(e)
          cat_margins()
        }

      )
      par(old_par)
    } else {
      cat(sprintf(
        "%s < %0.2f\n\n\n\n\n",
        "No peptides were found to have cutoff adjusted p-value",
        cutoff
      ))
    }

    if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) {
      # Add Phosphopeptide column to anova_filtered table
      # The assumption here is that the first intensity is unique;
      #   this is a hokey assumption but almost definitely will
      #   be true in the real world unless there is a computation
      #   error upstream.
      anova_filtered_merge <- base::merge(
        x = connect_df,
        y = filtered_data_filtered,
        by.x = "Intensity",
        by.y = 1
      )
      anova_filtered_merge_order <- rownames(filtered_p)

      anova_filtered <- data.frame(
        ppep    = anova_filtered_merge$Phosphopeptide,
        intense = anova_filtered_merge$Intensity,
        data    = anova_filtered_merge[, 2:number_of_samples + 1]
      )
      colnames(anova_filtered) <-
        c("Phosphopeptide", colnames(filtered_data_filtered))

      # Merge qualitative columns into the ANOVA data
      output_table <- data.frame(anova_filtered$Phosphopeptide)
      output_table <- base::merge(
        x = output_table,
        y = data_table_imp_qn_lt,
        by.x = "anova_filtered.Phosphopeptide",
        by.y = "Phosphopeptide"
      )

      # Produce heatmap to visualize significance and the effect of imputation

      cat_hm_heading <- function(m, cutoff) {
        if (nrow(m) > g_intensity_hm_rows) {
          cat("\\clearpage\n")
          cat(subsection_header(
            paste(
              sprintf("Heatmap for the %d most-significant peptides",
                g_intensity_hm_rows),
              sprintf("whose adjusted p-value < %0.2f\n", cutoff)
            )
          ))
        } else {
          if (nrow(m) < 2) {
            return(FALSE)
          } else {
            cat(subsection_header(
              paste(
                sprintf("Heatmap for %d usable peptide genes whose", nrow(m)),
                sprintf("adjusted p-value < %0.2f\n", cutoff)
              )
            ))
          }
        }
        cat("\n\n\n")
        cat("\n\n\n")
        return(TRUE)
      }

      # construct matrix with appropriate rownames
      m <-
        as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
      nrow_m <- nrow(m)
      ncol_m <- ncol(m)
      if (nrow_m > 0) {
        rownames_m <- rownames(m)
        q <- data.frame(pepname = rownames_m)
        g <- sqldf::sqldf("
          SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name
            FROM q, metadata_plus_p AS met
            WHERE q.pepname = met.Phosphopeptide
            ORDER BY q.rowid
        ")
        tmp <- sapply(
          X = seq_len(nrow(g)),
          FUN = function(i) {
            pre <- strsplit(g$gene_name[i], "; ")[[1]]
            rslt <- paste(unique(pre), sep = "; ")
            return(rslt)
          }
        )
        tmp <- unlist(tmp)
        tmp <-
          make.names(tmp, unique = TRUE)
        tmp <- sub(
            "No_Gene_Name",
            "not_found",
            tmp,
            fixed = TRUE
          )
        ten_trunc_names <- trunc_ppep(rownames_m)
        tmp <- sapply(
          X = seq_len(nrow_m),
          FUN = function(i) {
            sprintf(
              "(%s) %s",
              tmp[i],
              ten_trunc_names[i]
            )
          }
        )
        rownames(m) <- tmp
      }
      # draw the heading and heatmap
      if (nrow_m > 0) {
          number_of_peptides_found <-
            ppep_heatmap(
              m                       = m,
              cutoff                  = cutoff,
              hm_heading_function     = cat_hm_heading,
              hm_main_title           =
                "log(intensities), row-scaled, unimputed, unnormalized",
              suppress_row_dendrogram = FALSE,
              master_cex              = 0.35,
              sepcolor                = "black",
              colsep                  = sample_colsep
            )
          if (number_of_peptides_found > 1) {
            cat("\\leavevmode\n")
          }
      }
    }
  }
}

cat("\\leavevmode\n")

if (!g_can_run_ksea) {
  errmsg <- paste("Cannot proceed with KSEA analysis",
    "because too many values are missing.")
  if (FALSE) cat0(
        errmsg,
        "\\stepcounter{offset}\n",
        "\\stepcounter{offset}\n",
        "\\stepcounter{offset}\n",
        " in ",
        table_href(),
        ".\n\n"
      )
  if (FALSE) {
    if (print_nb_messages) nbe(see_variable(p_value_data))
  } else {
    if (print_nb_messages) nbe(see_variable(p_value_data))

    display_p_value_data <- p_value_data
    display_p_value_data$raw_anova_p <-
      sprintf("%0.3g", display_p_value_data$raw_anova_p)
    display_p_value_data$fdr_adjusted_anova_p <-
      sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p)
    display_p_value_data$quality <-
      sprintf("%0.3g", display_p_value_data$quality)

    headers_1st_line <-
      c("",               "Raw ANOVA", "FDR-adj.", "Missing", "Min. #",
        "",        "")
    headers_2nd_line <-
      c("Phosphopeptide", "p-value",   "p-value",  "values",  "group-obs",
        "Quality", "Ranking")
    data_frame_tabbing_latex(
      x = display_p_value_data,
      tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60),
      use_subsubsection_header = FALSE,
      headings = c(headers_1st_line, headers_2nd_line),
      caption = "ANOVA results"

      )
  }
  data_frame_tabbing_latex(
    x = save_sample_treatment_df,
    tabstops = c(1.25, 1.25),
    caption = "Sample classes",
    use_subsubsection_header = FALSE
    )
  param_df_exit()
  knitr::knit_exit()
  return(invisible(-1))
}

```

```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}

if (g_can_run_ksea && count_of_treatment_levels > 1) {
  # Prepare two-way contrasts with adjusted p-values
  # Strategy:
  # - use imputed, log-transformed data:
  #   - remember this when computing log2(fold-change)
  # - each contrast is between a combination of trt levels
  # - for each contrast, compute samples that are members
  # - compute one-way test:
  #   - use `oneway.test` (Welch test) if numbers of samples
  #     are not equivalent between trt levels
  #   - otherwise, aov is fine but offers no advantage
  # - adjust p-value, assuming that
  #   (# of pppeps)*(# of contrasts) tests were performed

  # Each contrast is between a combination of trt levels
  m2 <- combn(
    x = seq_len(length(levels(smpl_trt))),
    m = 2,
    simplify = TRUE
  )
  contrast_count <- ncol(m2)

  # For each contrast, compute samples that are members
  # - local function to construct a data.frame for each contrast
  #   - the contrast in the first "column"
  f_m2 <-
    function(cntrst, lvl1, lvl2) {
      return(
        data.frame(
          contrast = cntrst,
          level = smpl_trt[
              smpl_trt %in%
                levels(smpl_trt)[c(lvl1, lvl2)]
            ],
          label = sample_name_matches[
              smpl_trt %in%
                levels(smpl_trt)[c(lvl1, lvl2)]
            ]
        )
      )
    }
  # - compute a df for each contrast
  sample_level_dfs <- lapply(
    X = 1:contrast_count,
    FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
  )
  # - compute a single df for all contrasts
  combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)

  # - dispose objects to free resources
  rm(sample_level_dfs)

  # - write the df to a DB for later join-per-contrast
  db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)

  RSQLite::dbWriteTable(
    conn = db,
    name = "contrast",
    value = combined_contrast_df,
    overwrite = TRUE
  )

  # Create UK for insert
  ddl_exec(db, "
    CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
      ON contrast(contrast, label);
    "
  )
  # Create indexes for join
  ddl_exec(db, "
    -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
    CREATE INDEX IF NOT EXISTS contrast__label__idx
      ON contrast(label);
    "
  )
  ddl_exec(db, "
    -- index for joining two contrast_lvl_ppep_avg_quant on contrast
    CREATE INDEX IF NOT EXISTS contrast__contrast__idx
      ON contrast(contrast);
    "
  )
  ddl_exec(db, "
    -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
    CREATE INDEX IF NOT EXISTS contrast__level__idx
      ON contrast(level);
    "
  )
  # - dispose objects to free resources
  rm(combined_contrast_df)

  # Use imputed, log-transformed data
  # - remember that this was donoe when computing log2(fold-change)
  # - melt data matrix for use in later join-per-contrast
  casted <- cbind(
    data.frame(vrbl = rownames(quant_data_imp_qn_log)),
    quant_data_imp_qn_log
  )
  quant_data_imp_qn_log_melted <- reshape2::melt(
    casted,
    id.vars = "vrbl"
  )
  colnames(quant_data_imp_qn_log_melted) <-
    c("phosphopep", "sample", "quant")
  # - dispose objects to free resources
  rm(casted)

  # - write the df to a DB for use in later join-per-contrast
  RSQLite::dbWriteTable(
    conn = db,
    name = "ppep_smpl_qnlt",
    value = quant_data_imp_qn_log_melted,
    overwrite = TRUE
  )
  # Create UK for insert
  ddl_exec(db, "
    CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
      ON ppep_smpl_qnlt(phosphopep, sample);
    "
  )
  # Create index for join
  ddl_exec(db, "
    -- index for join in contrast_ppep_smpl_qnlt
    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
      ON ppep_smpl_qnlt(sample);
    "
  )
  ddl_exec(db, "
    -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
    CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
      ON ppep_smpl_qnlt(phosphopep);
    "
  )
  # - dispose objects to free resources
  rm(quant_data_imp_qn_log_melted)

  # - drop views if exist
  ddl_exec(db, "
    -- drop view dependent on contrast_lvl_ppep_avg_quant
    DROP VIEW IF EXISTS v_contrast_log2_fc;
    "
  )
  ddl_exec(db, "
    -- drop table dependent on contrast_ppep_smpl_qnlt
    DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
    "
  )
  ddl_exec(db, "
    DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
    "
  )
  ddl_exec(db, "
    DROP VIEW IF EXISTS v_contrast_lvl_metadata;
    "
  )
  ddl_exec(db, "
    -- drop view dependent on contrast_ppep_smpl_qnlt
    DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
    "
  )
  ddl_exec(db, "
    DROP VIEW IF EXISTS v_contrast_lvl_lvl;
    "
  )
  ddl_exec(db, "
    -- drop view upon which other views depend
    DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
    "
  )
  # - create view
  dml_no_rows_exec(db, "
      -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
      --   compute p-value for test of trt effect for two trt levels
      CREATE VIEW contrast_ppep_smpl_qnlt
        AS
      SELECT  contrast,
              level,
              phosphopep,
              sample,
              quant
        FROM  contrast AS c,
              ppep_smpl_qnlt AS q
        WHERE q.sample = c.label
        ORDER BY contrast, level, phosphopep
      ;
    "
  )
  # - create simplification views
  dml_no_rows_exec(db, "
      CREATE VIEW v_contrast_lvl_metadata
        AS
      SELECT  contrast,
              level,
              group_concat(label, ';') AS samples
        FROM  contrast
        GROUP BY contrast, level
        /* view v_contrast_lvl_metadata is used
           to simplify creation of table contrast_lvl_lvl_metadata */
      ;
    "
  )
  dml_no_rows_exec(db, "
      CREATE VIEW v_contrast_lvl_ppep_avg_quant
        AS
      SELECT  contrast,
              level,
              phosphopep,
              avg(quant)                AS avg_quant
        FROM  contrast_ppep_smpl_qnlt
        GROUP BY contrast, level, phosphopep
        /* view v_contrast_lvl_ppep_avg_quant is used
           to simplify view v_contrast_log2_fc */
      ;
    "
  )

  # - create contrast-metadata table
  if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata")
  dml_no_rows_exec(db, "
      CREATE TABLE contrast_lvl_lvl_metadata
        AS
      SELECT DISTINCT
              a.contrast              AS ab_contrast,
              a.level                 AS a_level,
              b.level                 AS b_level,
              a.samples               AS a_samples,
              b.samples               AS b_samples,
              'log2(level_'||a.level||'/level_'||b.level||')'
                                      AS fc_description
        FROM  v_contrast_lvl_metadata AS a,
              v_contrast_lvl_metadata AS b
        WHERE a.contrast = b.contrast
          AND a.level > b.level
        /* view v_contrast_lvl_lvl is used
           to simplify view v_contrast_log2_fc */
      ;
    "
  )
  # - create pseudo-materialized view table
  dml_no_rows_exec(db, "
      CREATE VIEW v_contrast_lvl_lvl
        AS
      SELECT DISTINCT
              a.contrast AS ab_contrast,
              a.level    AS a_level,
              b.level    AS b_level
        FROM  contrast   AS a,
              contrast   AS b
        WHERE a.contrast = b.contrast
          AND a.level > b.level
        /* view v_contrast_lvl_lvl is used
           to simplify view v_contrast_log2_fc */
      ;
    "
  )

  # - create view to compute log2(fold-change)
  dml_no_rows_exec(db, "
      CREATE VIEW v_contrast_log2_fc
        AS
      SELECT  ab.ab_contrast                         AS contrast,
              m.a_level                              AS a_level,
              c.avg_quant                            AS a_quant,
              m.a_samples                            AS a_samples,
              ab.b_level                             AS b_level,
              d.avg_quant                            AS b_quant,
              m.b_samples                            AS b_samples,
              m.fc_description                       AS fc_description,
              3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
              d.phosphopep                           AS phosphopep
        FROM  contrast_lvl_lvl_metadata                  AS m,
              v_contrast_lvl_ppep_avg_quant              AS d,
              v_contrast_lvl_lvl AS ab
                INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
                  ON c.contrast = ab.ab_contrast
                  AND c.level   = ab.a_level
        WHERE d.contrast    = ab.ab_contrast
          AND m.ab_contrast = ab.ab_contrast
          AND d.level       = ab.b_level
          AND d.phosphopep  = c.phosphopep
        /* view to compute log2(fold-change) */
        ;
    "
  )

  # For each contrast, compute samples that are members
  # compute one-way test:
  # - use `oneway.test` (Welch test) if numbers of samples
  #   are not equivalent between trt levels
  # - otherwise, aov is fine but offers no advantage
  for (contrast in contrast_count:2) {
    invisible(contrast)
  }
  for (contrast in 1:contrast_count) {
    contrast_df <- sqldf::sqldf(
      x = paste0("
        SELECT level, phosphopep, sample, quant
          FROM contrast_ppep_smpl_qnlt
          WHERE contrast = ", contrast, "
          ORDER BY phosphopep, level, sample
      "),
      connection = db
    )
    contrast_cast <- reshape2::dcast(
      data = contrast_df,
      formula = phosphopep ~ sample,
      value.var = "quant"
    )
    contrast_cast_ncol <- ncol(contrast_cast)
    contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
    contrast_cast_samples <- colnames(contrast_cast_data)

    # - order grouping_factor by order of sample columns of contrast_cast_data
    grouping_factor <- sqldf::sqldf(
      x = paste0("
        SELECT sample, level
          FROM contrast_ppep_smpl_qnlt
          WHERE contrast = ", contrast, "
          ORDER BY phosphopep, level, sample
          LIMIT ", contrast_cast_ncol - 1
      ),
      connection = db
    )
    rownames(grouping_factor) <- grouping_factor$sample
    grouping_factor <- grouping_factor[, "level", drop = FALSE]

    # - run the two-level (one-way) test
    p_value_data_contrast_ps <-
      row_apply(
        x = contrast_cast_data,
        fun = anova_func,
        grouping_factor =
          as.factor(grouping_factor$level), # anova_func arg2
        one_way_f = one_way_two_categories, # anova_func arg3
        simplify = TRUE # TRUE is the default for simplify
        )

    if (!is.null(p_value_data_contrast_ps)) {
      contrast_data_adj_p_values <-
        p.adjust(
          p = p_value_data_contrast_ps,
          method = "fdr",
          n = length(p_value_data_contrast_ps) # this is the default, length(p)
          )

      # - compute the fold-change
      contrast_p_df <-
        data.frame(
          contrast = contrast,
          phosphopep = contrast_cast$phosphopep,
          p_value_raw = p_value_data_contrast_ps,
          p_value_adj = contrast_data_adj_p_values
        )
      db_write_table_overwrite <- (contrast < 2)
      db_write_table_append <- !db_write_table_overwrite
      RSQLite::dbWriteTable(
        conn = db,
        name = "contrast_ppep_p_val",
        value = contrast_p_df,
        append = db_write_table_append
        )
      # Create UK for insert
      ddl_exec(db, "
        CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
          ON contrast_ppep_p_val(phosphopep, contrast);
        "
      )
    }
  }
  # Perhaps this could be done more elegantly using unique keys
  #   or creating the tables before saving data to them, but this
  #   is fast and, if the database exists on disk rather than in
  #   memory, it doesn't stress memory.
  dml_no_rows_exec(db, "
    CREATE TEMP table contrast_log2_fc
      AS
    SELECT *
      FROM  v_contrast_log2_fc
      ORDER BY contrast, phosphopep
    ;
    "
  )
  dml_no_rows_exec(db, "
    CREATE TEMP table ppep_p_val
      AS
    SELECT  p_value_raw,
            p_value_adj,
            contrast   AS p_val_contrast,
            phosphopep AS p_val_ppep
      FROM  contrast_ppep_p_val
      ORDER BY contrast, phosphopep
    ;
    "
  )
  dml_no_rows_exec(db, "
    DROP TABLE IF EXISTS contrast_log2_fc_p_val
    ;
    "
  )
  dml_no_rows_exec(db, "
    CREATE TABLE contrast_log2_fc_p_val
      AS
    SELECT  a.*,
            b.p_value_raw,
            b.p_value_adj,
            b.p_val_contrast,
            b.p_val_ppep
      FROM  contrast_log2_fc a, ppep_p_val b
      WHERE a.rowid = b.rowid
        AND a.phosphopep = b.p_val_ppep
    ;
    "
  )
  # Create UK
  ddl_exec(db, "
    CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
      ON contrast_log2_fc_p_val(phosphopep, contrast);
    "
  )
  # Create indices for future queries
  ddl_exec(db, "
    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
      ON contrast_log2_fc_p_val(contrast);
    "
  )
  ddl_exec(db, "
    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
      ON contrast_log2_fc_p_val(phosphopep);
    "
  )
  ddl_exec(db, "
    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
      ON contrast_log2_fc_p_val(p_value_raw);
    "
  )
  ddl_exec(db, "
    CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
      ON contrast_log2_fc_p_val(p_value_adj);
    "
  )
  dml_no_rows_exec(db, "
    DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
    ;
    "
  )
  dml_no_rows_exec(db, "
      CREATE VIEW v_contrast_log2_fc_p_val
        AS
      SELECT  contrast,
              a_level,
              a_samples,
              b_level,
              b_samples,
              a_quant,
              b_quant,
              fc_description,
              log2_fc,
              p_value_raw,
              p_value_adj,
              phosphopep
        FROM  contrast_log2_fc_p_val
        ORDER BY contrast, phosphopep
        ;
    "
  )
  ddl_exec(db, "
    DROP TABLE IF EXISTS kseaapp_metadata
    ;
    "
  )
  dml_no_rows_exec(db, "
    CREATE TABLE kseaapp_metadata
      AS
    WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
              SELECT DISTINCT
                  deppep.seq,
                  ppep.seq,
                  GeneName||';',
                  UniProtID||';',
                  PhosphoResidue||';'
              FROM
                ppep, deppep, mrgfltr_metadata
              WHERE
                  mrgfltr_metadata.ppep_id = ppep.id
                AND
                  ppep.deppep_id = deppep.id
        )
    SELECT
        ppep                                                  AS `ppep`,
        SUBSTR(uniprot_id,     1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
        SUBSTR(gene_name,      1, INSTR(gene_name,';')  - 1 ) AS `Gene`,
        deppep                                                AS `Peptide`,
        REPLACE(
          REPLACE(
            SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
            'p',
            ''
            ),
          ', ',
          ';'
          )                                                   AS `Residue.Both`
      FROM extended
      ;
    "
  )
  # Create indexes for join
  ddl_exec(db, "
    CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
      ON kseaapp_metadata(ppep);
    "
  )
  ddl_exec(db, "
    DROP VIEW IF EXISTS v_kseaapp_contrast
    ;
    "
  )
  dml_no_rows_exec(db, "
    CREATE VIEW v_kseaapp_contrast
      AS
    SELECT  a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
      FROM  v_contrast_log2_fc_p_val a, kseaapp_metadata b
      WHERE b.ppep = a.phosphopep
      ;
      "
  )
  ddl_exec(db, "
    DROP VIEW IF EXISTS v_kseaapp_input
    ;
    "
  )
  dml_no_rows_exec(db, "
    CREATE VIEW v_kseaapp_input
      AS
    SELECT  v.contrast,
            v.phosphopep,
            m.`Protein`,
            m.`Gene`,
            m.`Peptide`,
            m.`Residue.Both`,
            v.p_value_raw AS `p`,
            v.log2_fc AS `FC`
      FROM  kseaapp_metadata AS m,
            v_contrast_log2_fc_p_val AS v
      WHERE m.ppep = v.phosphopep
        AND NOT m.`Gene` = 'No_Gene_Name'
        AND NOT v.log2_fc = 0
      ;
      "
  )
  # We are done with DDL and insertion
  RSQLite::dbDisconnect(db)
}
```

```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```

# KSEA Analysis Summaries

Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched.   Enrichments are found by the CRAN `KSEAapp` package:

- The package is available on CRAN, at https://cran.r-project.org/package=KSEAapp
- The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415).
- An online alternative (supporting only analysis of human data) is available at [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/).

For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:

$$
\text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j}
$$

and fold-enrichment is computed as:

$$
\text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j}
$$

where:

- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$,
- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and
- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset.
- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.

$\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score.

Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).

Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.

- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).

For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality".

Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1.

```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'}
cat("\\clearpage\n")

db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)

# -- eliminate the table that's about to be defined
ddl_exec(db, "
DROP TABLE IF EXISTS site_metadata;
")

# -- define the site_metadata table
ddl_exec(db, "
CREATE TABLE site_metadata(
  id            INTEGER PRIMARY KEY
, site_type_id  INTEGER REFERENCES site_type(id)
, full          TEXT    UNIQUE ON CONFLICT IGNORE
, abbrev        TEXT
, pattern       TEXT
, motif         TEXT
);
")

# -- populate the table with initial values
ddl_exec(db, "
INSERT INTO site_metadata(full, abbrev, site_type_id)
  SELECT  DISTINCT kinase_map, kinase_map, site_type_id
    FROM  ppep_gene_site
    ORDER BY kinase_map;
")

# -- drop bogus KSData view if exists
ddl_exec(db, "
DROP VIEW IF EXISTS ks_data_v;
")

# -- create view to serve as an impostor for  KSEAapp::KSData
ddl_exec(db, "
CREATE VIEW IF NOT EXISTS ks_data_v
AS
SELECT
  'NA' AS KINASE,
  'NA' AS KIN_ACC_ID,
  kinase_map AS GENE,
  'NA' AS KIN_ORGANISM,
  'NA' AS SUBSTRATE,
  0 AS SUB_GENE_ID,
  'NA' AS SUB_ACC_ID,
  gene_names AS SUB_GENE,
  'NA' AS SUB_ORGANISM,
  phospho_peptide AS SUB_MOD_RSD,
  0 AS SITE_GROUP_ID,
  'NA' AS 'SITE_7AA',
  2 AS networkin_score,
  type_name AS Source
FROM ppep_gene_site_view;
")

contrast_metadata_df <-
  sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
rslt <- new_env()
rslt$score_list <- list()
rslt$name_list  <- list()
rslt$longname_list  <- list()

ddl_exec(db, "
  DROP TABLE IF EXISTS contrast_ksea_scores;
  "
)

next_index <- 0
err_na_subscr_df_const <-
 "missing values are not allowed in subscripted assignments of data frames"

for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
  cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
  cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
  cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
  contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
  contrast_longlabel <- (
    sprintf(
      "Class %s -> Class %s",
      contrast_metadata_df[i_cntrst, "b_level"],
      contrast_metadata_df[i_cntrst, "a_level"]
    )
  )
  kseaapp_input <-
    sqldf::sqldf(
      x = sprintf("
        SELECT `Protein`,`Gene`,`Peptide`,phosphopep AS `Residue.Both`,`p`,`FC`
          FROM v_kseaapp_input
          WHERE contrast = %d
        ",
        i_cntrst
      ),
    connection = db
    )

  pseudo_ksdata <- dbReadTable(db, "ks_data_v")

  # This hack is because SQL table has the log2-transformed values
  kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
  main_title <- (
      sprintf(
      "Change from treatment %s to treatment %s",
      contrast_metadata_df[i_cntrst, "b_level"],
      contrast_metadata_df[i_cntrst, "a_level"]
    )
  )
  sub_title <- contrast_longlabel
  tryCatch(
    expr = {
        ksea_scores_rslt <-
          ksea_scores(
            ksdata                  = pseudo_ksdata,
            px                      = kseaapp_input,
            networkin               = TRUE,
            networkin_cutoff        = 2,
            minimum_substrate_count = ksea_min_substrate_count
            )

        if (FALSE) {
          ksea_scores_rslt <-
            ksea_scores_rslt[
              ksea_scores_rslt$m >= ksea_min_substrate_count,
              ,
              drop = FALSE
            ]
        }

        if (FALSE) {
          if (print_nb_messages) nbe(
            "Output contents of `ksea_scores_rslt` table\n")
          cat_variable(ksea_scores_rslt)
          cat("\n\\clearpage\n")
          data_frame_tabbing_latex(
            x = ksea_scores_rslt,
            tabstops = c(1.8, 0.8, 0.8, 0.3, 0.8, 0.8),
            caption = paste("KSEA scores for contrast ",
                        cntrst_b_level, "to",  cntrst_a_level),
            use_subsubsection_header = FALSE
          )
        }

        if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
          next_index <- 1 + next_index
          rslt$score_list[[next_index]] <- ksea_scores_rslt
          rslt$name_list[[next_index]] <- contrast_label
          rslt$longname_list[[next_index]] <- contrast_longlabel
          ksea_low_fdr_print(
            rslt = rslt,
            i_cntrst = i_cntrst,
            i = next_index,
            a_level = cntrst_a_level,
            b_level = cntrst_b_level,
            fold_change = cntrst_fold_change,
            caption = contrast_longlabel
            )
        }
      },
    error = function(e) {
        str(e)
        cat_margins()
      }
  )
}

plotted_kinases <- NULL
if (g_can_run_ksea && length(rslt$score_list) > 1) {
  for (i in seq_len(length(ksea_heatmap_titles))) {
    hdr <- ksea_heatmap_titles[[i]]
    which_kinases <- i

    cat("\\clearpage\n\\begin{center}\n")
    if (i == const_ksea_astrsk_kinases) {
      cat(subsection_header(hdr))
    } else {
      cat(subsection_header(hdr))
    }
    cat("\\end{center}\n")

    plotted_kinases <- ksea_heatmap(
      # the data frame outputs from the KSEA.Scores() function, in list format
      score_list = rslt$score_list,
      # a character vector of all the sample names for heatmap annotation:
      # - the names must be in the same order as the data in score_list
      # - please avoid long names, as they may get cropped in the final image
      sample_labels = rslt$name_list,
      # character string of either "p.value" or "FDR" indicating the data column
      #   to use for marking statistically significant scores
      stats = c("p.value", "FDR")[2],
      # a numeric value between 0 and infinity indicating the min. number of
      #   substrates a kinase must have to be included in the heatmap
      m_cutoff = 1,
      # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
      #   for indicating significant kinases in the heatmap
      p_cutoff = params$kseaCutoffThreshold,
      # a binary input of TRUE or FALSE, indicating whether or not to perform
      #   hierarchical clustering of the sample columns
      sample_cluster = TRUE,
      # a binary input of TRUE or FALSE, indicating whether or not to export
      #   the heatmap as a .png image into the working directory
      export = FALSE,
      # additional arguments to gplots::heatmap.2, such as:
      # - main: main title of plot
      # - xlab: x-axis label
      # - ylab: y-axis label
      xlab = "Contrast",
      ylab = "Kinase",
      # print which kinases:
      # - 1 : all kinases
      # - 2 : significant kinases
      # - 3 : non-significant kinases
      which_kinases = which_kinases,
      margins = c(7, 15)
    )
    if (!is.null(plotted_kinases)) {
      cat("\\begin{center}\n")
      if (which_kinases != const_ksea_nonastrsk_kinases)
        cat("Asterisks reflect significance.\n")
      cat("\\end{center}\n")
    }
  } # end for (i in ...
} # end if (length ...
```

```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'}
ksea_prints <- list()
ksea_barplots <- list()
    for (i_cntrst in seq_len(length(rslt$score_list))) {
      next_index <- i_cntrst
      cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
      cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
      cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
      contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
      contrast_longlabel <- (
        sprintf(
          "Class %s -> Class %s",
          contrast_metadata_df[i_cntrst, "b_level"],
          contrast_metadata_df[i_cntrst, "a_level"]
        )
      )
      main_title <- (
          sprintf(
          "Change from treatment %s to treatment %s",
          contrast_metadata_df[i_cntrst, "b_level"],
          contrast_metadata_df[i_cntrst, "a_level"]
        )
      )
      sub_title <- contrast_longlabel
      tryCatch(
        expr = {
            ksea_scores_rslt <- rslt$score_list[[next_index]]
            if (print_nb_messages) nbe(see_variable(ksea_scores_rslt))

            if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
              sink(deferred <- file())
              ksea_low_fdr_print(
                rslt = rslt,
                i_cntrst = i_cntrst,
                i = next_index,
                a_level = cntrst_a_level,
                b_level = cntrst_b_level,
                fold_change = cntrst_fold_change,
                caption = contrast_longlabel,
                write_db = FALSE,
                anchor = const_table_anchor_t
                )
              cat("\n")
              sink()
              lines <-
                paste(
                 readLines(deferred, warn = FALSE),
                 collapse = "\n"
                 )
              close(deferred)
              sq_put(ksea_prints, lines)
              sink(stderr())
              cat("\n---\n")
              cat_variable(ksea_prints)
              barplot_closure <-
                ksea_low_fdr_barplot_factory(
                  rslt = rslt,
                  i_cntrst = i_cntrst,
                  i = next_index,
                  a_level = cntrst_a_level,
                  b_level = cntrst_b_level,
                  fold_change = cntrst_fold_change,
                  caption = contrast_longlabel
                )
              if (rlang::is_closure(barplot_closure))
                sq_put(ksea_barplots, barplot_closure)
              else
                sq_put(ksea_barplots, no_op)
              str(ksea_barplots)
              cat("\n...\n")
              sink()
            }
          },
        error = function(e) {
            str(e)
            cat_margins()
          }
      )
    }
```

```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}

have_kinase_descriptions <-
  if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) &&
      !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2))
  ) {
    rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID
    kinase_name_to_desc_uprt <- function(s) {
      rslt <- NULL
      tryCatch(
        {
          which_rows <- eval(s == kinase_name_uprt_lut$kinase)
          kinase_uprtid <-
            kinase_name_uprt_lut[which_rows, 2]
          # filter for first _HUMAN match if any
          grepl_human <- grepl("_HUMAN$", kinase_uprtid)
          if (0 < sum(grepl_human))
            kinase_uprtid <- kinase_uprtid[grepl_human]
          # filter for first match if any
          if (0 < length(kinase_uprtid)) {
            kinase_uprtid <- kinase_uprtid[1]
            kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2]
            if (!is.na(kinase_desc))
              rslt <- c(kinase_desc, kinase_uprtid)
            else
              rslt <- c(kinase_desc, "")
          }
        },
        warning = str
      )
      rslt
    }
    TRUE
  } else {
    kinase_name_to_desc_uprt <- function(s) NULL
    FALSE
  }
```

```{r write_params, echo = FALSE, results = 'asis'}
# perhaps this should be moved into the functions section, eventually ...
write_params <- function(db) {
  # write parameters to report

  # write parameters to SQLite output

  mqppep_anova_script_param_df <- data.frame(
    script    = "mqppep_anova_script.Rmd",
    parameter = names(param_unlist),
    value     = param_unlist
    )
  ddl_exec(db, "
    DROP TABLE IF EXISTS script_parameter;
    "
  )
  ddl_exec(db, "
    CREATE TABLE IF NOT EXISTS script_parameter(
      script    TEXT,
      parameter TEXT,
      value     ANY,
      UNIQUE (script, parameter) ON CONFLICT REPLACE
      )
      ;
    "
  )
  RSQLite::dbWriteTable(
    conn = db,
    name = "script_parameter",
    value = mqppep_anova_script_param_df,
    append = TRUE
  )

  loaded_packages_df <-  sessioninfo::package_info("loaded")
  loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
  loaded_packages_df <- data.frame(
    package = loaded_packages_df$package,
    version = loaded_packages_df$loadedversion,
    date    = loaded_packages_df$date
    )
  cat("\\clearpage\n\\section{Input parameter settings}\n")
  data_frame_tabbing_latex(
    x = param_df,
    tabstops = c(1.75),
    underscore_whack = TRUE,
    caption = "Input parameters",
    verbatim = FALSE
    )
}

if (!have_kinase_descriptions) {
  write_params(db)
  # We are done with output
  RSQLite::dbDisconnect(db)
  param_df_exit()
  knitr::knit_exit()
}
```

```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'}
# nolint start
#   squash un-actionable cyclomatic_complexity warning
if (have_kinase_descriptions) {
# nolint end
  my_section_header <-
    sprintf(
      "inases whose KSEA %s < %s\n",
      ksea_cutoff_statistic,
      signif(ksea_cutoff_threshold, 2)
    )

  # Use enriched kinases to find enriched kinase-substrate pairs
  enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))

  enriched_kinase_descs <-
    Reduce(
      f = function(l, r) {
        lkup <- kinase_name_to_desc_uprt(r)
        if (is.null(lkup)) l
          else r2 <- rbind(
            l,
            data.frame(
              kinase = r,
              uniprot_id = lkup[2],
              description = lkup[1]
            )
          )
        },
      x = enriched_kinases$kinase,
      init = NULL
    )

  if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) {
    cat("\n\\clearpage\n")
    data_frame_tabbing_latex(
      x = enriched_kinase_descs,
      tabstops = c(0.9, 1.3),
      headings = c("Kinase", "UniProt ID", "Description"),
      caption = paste0("Descriptions of k", my_section_header)
    )
  }

  if (FALSE) {
    cat_variable(sqldf::sqldf("SELECT kinase FROM enriched_kinases"))
    cat_variable(sqldf::sqldf("
       SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
         FROM pseudo_ksdata
         WHERE gene IN (SELECT kinase FROM enriched_kinases)
    "))
    data_frame_table_latex(
      x = sqldf::sqldf("
       SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
         FROM pseudo_ksdata
         WHERE gene IN (SELECT kinase FROM enriched_kinases)
      "),
      justification = "l l l",
      centered = TRUE,
      caption = "substrates of enriched kinases",
      anchor = c(const_table_anchor_p, const_table_anchor_t),
      underscore_whack = TRUE
    )
    data_frame_table_latex(
      x = sqldf::sqldf("
         SELECT
           gene AS kinase,
           ppep,
           sub_gene,
           '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
           fdr_adjusted_anova_p,
           quality,
           min_group_obs_count
         FROM (
           SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
             FROM pseudo_ksdata
             WHERE gene IN (SELECT kinase FROM enriched_kinases)
           ),
           p_value_data
         WHERE ppep = phosphopeptide
         GROUP BY kinase, ppep
         ORDER BY kinase, p_value_data.quality DESC
      "),
      justification = "l l l l l l l",
      centered = TRUE,
      caption = "labeled substrates of enriched kinases",
      anchor = c(const_table_anchor_p, const_table_anchor_t),
      underscore_whack = TRUE
    )
  }
  all_enriched_substrates <- sqldf::sqldf("
   SELECT
     gene AS kinase,
     ppep,
     sub_gene,
     '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
     fdr_adjusted_anova_p,
     quality,
     min_group_obs_count
   FROM (
     SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
       FROM pseudo_ksdata
       WHERE gene IN (SELECT kinase FROM enriched_kinases)
     ),
     p_value_data
   WHERE ppep = phosphopeptide
   GROUP BY kinase, ppep
   ORDER BY kinase, ppep, p_value_data.quality DESC
   ")

  all_enriched_substrates <-
    all_enriched_substrates[
      all_enriched_substrates$quality >= params$minQuality,
      ,
      drop = FALSE
    ]

  all_enriched_substrates$sub_gene <-
    sub(" ///.*", "...",
      all_enriched_substrates$sub_gene
    )

  all_enriched_substrates$label <-
    with(
      all_enriched_substrates,
      sprintf(
        "(%s-%s) %s",
        kinase,
        trunc_subgene(sub_gene),
        ppep
      )
    )

  # this global is set to TRUE by cat_enriched_heading immediately below
  g_neednewpage <- FALSE

  # helper used to label per-kinase substrate enrichment figure
  cat_enriched_heading <- function(m, cut_args) {
    cutoff <- cut_args$cutoff
    kinase <- cut_args$kinase
    if (g_neednewpage) cat("\\newpage\n")
    g_neednewpage <- TRUE
    if (nrow(m) > g_intensity_hm_rows) {
      cat(subsection_header(
        sprintf(
          "Highest-quality %d (of %d) enriched %s-sites",
          g_intensity_hm_rows,
          nrow(m),
          kinase
        )
      ))
    } else {
      if (nrow(m) == 0) {
        return(FALSE)
      } else {
        nrow_m <- nrow(m)
        cat(subsection_header(
          sprintf(
            "%d enriched %s-site%s",
            nrow_m,
            kinase,
            if (nrow_m > 1) "s" else ""
            )
        ))
      }
    }
    cat("\n\n\n")
    cat("\n\n\n")
    return(TRUE)
  }

  # --------------------------------
  # hack begin - show all substrates
  enriched_substrates <- all_enriched_substrates
  # add "FALSE &&" to prevent listing of substrates
  if (show_enriched_substrates && nrow(enriched_substrates) > 0) {
    short_row_names <- sub(
        "$FAILED_MATCH_GENE_NAME",
        "not_found",
        enriched_substrates$sub_gene,
        fixed = TRUE
      )

    if (print_nb_messages) nbe(see_variable(enriched_substrates))
    substrates_df <- with(
        enriched_substrates,
        data.frame(
          kinase = kinase,
          substrate = sub(" ///*", "...", short_row_names),
          anova_p_value = signif(fdr_adjusted_anova_p, 2),
          min_group_obs_count = signif(min_group_obs_count, 0),
          quality = signif(quality, 3),
          sequence = trunc_n(30)(ppep)
        )
      )

    substrates_df <- substrates_df[
      with(substrates_df, order(kinase, -quality)),
      ,
      drop = FALSE
      ]

    if (print_nb_messages) nbe(see_variable(substrates_df))
    if (nrow(substrates_df) < 1)
      substrates_df$sequence <- c()
    if (print_nb_messages) nbe(see_variable(substrates_df))
    names(substrates_df) <- headers_2nd_line <-
      c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence")
    headers_1st_line <- c("", "", "ANOVA", "min(values", "", "")
    data_frame_tabbing_latex(
      x = substrates_df,
      tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5),
      headings = c(headers_1st_line, headers_2nd_line),
      caption = "Details for all enriched substrates of enriched kinases"
    )
    rm(
      enriched_substrates,
      substrates_df,
      short_row_names,
      headers_1st_line,
      headers_2nd_line
      )
  }
  cat("\\clearpage\n")
  # hack end - show all substrates
  # --------------------------------

  # print deferred tables and graphs for kinases from contrasts
  for (i_cntrst in seq_len(length(ksea_prints))) {
    #latex_samepage({
      cat(ksea_prints[[i_cntrst]])
      cat("\n")
      ksea_barplots[[i_cntrst]]()
      cat("\n")
      cat("\\clearpage\n")
    #})
  }

}
```

```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
# nolint start
#   squash un-actionable cyclomatic_complexity warning
if (g_can_run_ksea) {
# nolint end
  g_did_enriched_header <- FALSE
  for (kinase_name in sort(enriched_kinases$kinase)) {
    enriched_substrates <-
      all_enriched_substrates[
        all_enriched_substrates$kinase == kinase_name,
        ,
        drop = FALSE
        ]
    ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep)
    enriched_substrates$label <- with(
      enriched_substrates,
      sprintf(
        "(%s) %s",
        make.names(
          sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE),
          unique = TRUE
        ),
        ten_trunc_ppep
        )
      )
    # Get the intensity values for the heatmap
    enriched_intensities <-
      as.matrix(unimputed_quant_data_log[
                  enriched_substrates$ppep, , drop = FALSE
                  ])

    # Remove rows having too many NA values to be relevant
    good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0)

    enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE]
    enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE]

    # Rename the rows with the display-name for the heatmap
    short_row_names <- sub(
        "$FAILED_MATCH_GENE_NAME",
        "not_found",
        enriched_substrates$sub_gene,
        fixed = TRUE
      )
    short_row_names <-
      make.names(short_row_names, unique = TRUE)
    long_row_names <-
      sapply(
        X = rownames(enriched_intensities),
        FUN = function(rn) {
          enriched_substrates[enriched_substrates$ppep == rn, "label"]
        }
      )
    rownames(enriched_intensities) <- long_row_names
    # Format as matrix for heatmap
    m <- as.matrix(enriched_intensities)
    rownames(m) <- trunc_enriched_substrate(rownames(m))

    # Draw the heading and heatmap
    nrow_m <- nrow(m)
    if (nrow_m > 0) {
      if (!g_did_enriched_header) {
        cat("\n\\clearpage\n")
        cat(section_header(paste0("K", my_section_header)))
        g_did_enriched_header <- TRUE
      }
      is_na_m <- is.na(m)
      cellnote_m <- is_na_m
      cellnote_m[!is_na_m] <- ""
      cellnote_m[is_na_m] <- params$heatMapNAcellNote
      cut_args <- new_env()
      cut_args$cutoff <- cutoff
      cut_args$kinase <- kinase_name
      cut_args$statistic <- ksea_cutoff_statistic
      cut_args$threshold <- ksea_cutoff_threshold
      number_of_peptides_found <-
        ppep_heatmap(
          m                       = m,
          cellnote                = cellnote_m,
          cutoff                  = cut_args,
          hm_heading_function     = cat_enriched_heading,
          hm_main_title           = paste(
            "Unnormalized (zero-imputed)",
            "intensities of enriched kinase-substrates"),
          suppress_row_dendrogram = FALSE,
          master_cex              = 0.35,
          sepcolor                = "black",
          colsep                  = sample_colsep,
          row_scaling             = "none"
        )
        if (number_of_peptides_found > 1) {

          tryCatch(
            {
              rownames(m) <- short_row_names
              cov_heatmap(
                m = m,
                kinase_name = kinase_name,
                top_substrates = nrow_m > g_intensity_hm_rows
                )
            },
            error = function(e) {
                cat(
                  sprintf(
                    "ERROR: %s\n\\newline\n",
                    mget("e")
                  )
                )
                cat(
                  sprintf(
                    "message: %s\n\\newline\n",
                    e$message
                  )
                )
                cat_margins()
              }
          )
        }
      substrates_df <- with(
          enriched_substrates,
          data.frame(
            substrate = sub(" ///*", "...", short_row_names),
            sequence = trunc_long_ppep(ppep),
            anova_p_value = signif(fdr_adjusted_anova_p, 2),
            min_group_obs_count = signif(min_group_obs_count, 0),
            quality = signif(quality, 3)
          )
        )
      excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows
      if (excess_substrates)
        substrates_df <- substrates_df[1:g_intensity_hm_rows, ]
      names(substrates_df) <- headers_2nd_line <-
        c("Substrate", "Sequence", "p-value", "per group)", "quality")
      headers_1st_line <- c("", "", "ANOVA", "min(values", "")
      if (1 < nrow(enriched_substrates))
        cat("\n\\newpage\n")
      cat(subsubsection_header(
        sprintf(
          "Details for %s%s-sites",
          if (excess_substrates)
            sprintf(
              "%s \"highest quality\" ",
              g_intensity_hm_rows
            )
          else "",
          kinase_name
        )
      ))
      substrates_df <- substrates_df[order(-substrates_df$quality), ]
      data_frame_tabbing_latex(
        x = substrates_df,
        tabstops = c(0.8, 3.8, 0.6, 0.8),
        headings = c(headers_1st_line, headers_2nd_line)
      )
    } else {
      if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n")
    }
    if (print_nb_messages) nb("end kinase ", kinase_name, "\n")
  }

  # Write output tabular files

  # get kinase, ppep, concat(kinase) tuples for enriched kinases

  if (print_nb_messages) nb("kinase_ppep_label <- ...\n")
  if (print_nb_messages) nbe("kinase_ppep_label <- ...\n")
  kinase_ppep_label <- sqldf::sqldf("
    WITH
      t(ppep, label) AS
        (
          SELECT DISTINCT
              SUB_MOD_RSD AS ppep,
              group_concat(gene, '; ') AS label
            FROM pseudo_ksdata
            WHERE GENE IN (SELECT kinase FROM enriched_kinases)
            GROUP BY ppep
        ),
      k(kinase, ppep_join) AS
        (
        SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
          FROM pseudo_ksdata
          WHERE GENE IN (SELECT kinase FROM enriched_kinases)
        )
    SELECT k.kinase, t.ppep, t.label
      FROM  t,  k
      WHERE t.ppep = k.ppep_join
      ORDER BY k.kinase, t.ppep
    ")


  # extract what we need from full_data
  impish <- cbind(rownames(quant_data_imp), quant_data_imp)
  colnames(impish)[1] <- "Phosphopeptide"
  data_table_imputed_sql <- "
    SELECT
      f.*,
      k.label AS KSEA_enrichments,
      q.*
    FROM
      metadata_plus_p f
        LEFT JOIN kinase_ppep_label k
          ON f.Phosphopeptide = k.ppep,
      impish q
    WHERE
      f.Phosphopeptide = q.Phosphopeptide
    "
  data_table_imputed <- sqldf::sqldf(data_table_imputed_sql)
  # Zap the duplicated 'Phosphopeptide' column named 'ppep'
  data_table_imputed <-
      data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]

  # Output imputed, un-normalized data
  if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n")
  if (print_nb_messages) nbe(
    "Output imputed, un-normalized data tabular file\n")
  write.table(
      data_table_imputed
    , file = imputed_data_filename
    , sep = "\t"
    , col.names = TRUE
    , row.names = FALSE
    , quote = FALSE
    )


  #output quantile normalized data
  impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
  colnames(impish)[1] <- "Phosphopeptide"
  data_table_imputed <- sqldf::sqldf(data_table_imputed_sql)
  # Zap the duplicated 'Phosphopeptide' column named 'ppep'
  data_table_imputed <-
      data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
  if (print_nb_messages) nb("Output quantile normalized data tabular file\n")
  if (print_nb_messages) nbe("Output quantile normalized data tabular file\n")
  write.table(
    data_table_imputed,
    file = imp_qn_lt_data_filenm,
    sep = "\t",
    col.names = TRUE,
    row.names = FALSE,
    quote = FALSE
  )

  ppep_kinase <- sqldf::sqldf("
    SELECT DISTINCT k.ppep, k.kinase
      FROM (
        SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
          FROM pseudo_ksdata
          WHERE GENE IN (SELECT kinase FROM enriched_kinases)
        ) k
      ORDER BY k.ppep, k.kinase
    ")

  RSQLite::dbWriteTable(
    conn = db,
    name = "ksea_enriched_ks",
    value = ppep_kinase,
    append = FALSE
    )
}

if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n")

RSQLite::dbWriteTable(
  conn = db,
  name = "anova_signif",
  value = p_value_data,
  append = FALSE
  )

  ddl_exec(db, "
    DROP VIEW IF EXISTS stats_metadata_v;
    "
  )
  dml_no_rows_exec(db, "
      CREATE VIEW stats_metadata_v
        AS
      SELECT DISTINCT  m.*,
          p.raw_anova_p,
          p.fdr_adjusted_anova_p,
          kek.kinase AS ksea_enrichments
        FROM
          mrgfltr_metadata_view m
            LEFT JOIN anova_signif p
              ON m.phospho_peptide = p.phosphopeptide
            LEFT JOIN ksea_enriched_ks kek
              ON m.phospho_peptide = kek.ppep
      ;
    "
  )

if (print_nb_messages) nb(
   "Output contents of `stats_metadata_v` table to tabular file\n")
if (print_nb_messages) nbe(
   "Output contents of `stats_metadata_v` table to tabular file\n")

write.table(
  dbReadTable(db, "stats_metadata_v"),
  file = anova_ksea_mtdt_file,
  sep = "\t",
  col.names = TRUE,
  row.names = FALSE,
  quote = FALSE
  )

cat("\n\\clearpage\n")

```

# Data-processing summary flowchart

![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)

```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
cat("\\leavevmode\n\n\n")

write_params(db)
# We are done with output
RSQLite::dbDisconnect(db)

cat("\\clearpage\n\\section{R package versions}\n")
utils::toLatex(utils::sessionInfo())
```
author	galaxyp
date	Fri, 17 Feb 2023 22:38:40 +0000
parents	5c2beb4eb41c
children