# HG changeset patch
# User galaxyp
# Date 1666981641 0
# Node ID 08678c931f5dd40048531ea185ebe62075776155
# Parent dbff53e6f75f8e4fe42358bd84cc470ce7d9d6db
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.dia
Binary file KSEA_impl_flowchart.dia has changed
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.pdf
Binary file KSEA_impl_flowchart.pdf has changed
diff -r dbff53e6f75f -r 08678c931f5d MaxQuantProcessingScript.R
--- a/MaxQuantProcessingScript.R Mon Jul 11 19:22:25 2022 +0000
+++ b/MaxQuantProcessingScript.R Fri Oct 28 18:27:21 2022 +0000
@@ -220,7 +220,6 @@
type = "character",
help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
)
- # default = "^Number of Phospho [(]STY[)]$",
,
make_option(
c("-p", "--phosphoCol"),
@@ -229,7 +228,6 @@
help = paste0("PERL-compatible regular expression matching",
" header of column having number of 'Phospho (STY)'")
)
- # default = "^Intensity[^_]",
,
make_option(
c("-s", "--startCol"),
@@ -238,7 +236,6 @@
help = paste0("PERL-compatible regular expression matching",
" header of column having first sample intensity")
)
- # default = 1,
,
make_option(
c("-I", "--intervalCol"),
@@ -247,7 +244,6 @@
help = paste0("Column interval between the Intensities of samples",
" (eg, 1 if subsequent column; 2 if every other column")
)
- # default = 0.75,
,
make_option(
c("-l", "--localProbCutoff"),
@@ -255,7 +251,6 @@
type = "double",
help = "Localization Probability Cutoff"
)
- # default = "sum",
,
make_option(
c("-f", "--collapse_func"),
@@ -264,7 +259,6 @@
help = paste0("merge identical phosphopeptides",
" by ('sum' or 'average') the intensities")
)
- # default = "filtered_data.txt",
,
make_option(
c("-r", "--filtered_data"),
@@ -272,7 +266,6 @@
type = "character",
help = "filtered_data.txt"
)
- # default = "quantData.txt",
,
make_option(
c("-q", "--quant_data"),
diff -r dbff53e6f75f -r 08678c931f5d kinase_name_uniprot_lut.tabular.bz2
Binary file kinase_name_uniprot_lut.tabular.bz2 has changed
diff -r dbff53e6f75f -r 08678c931f5d kinase_uniprot_description_lut.tabular.bz2
Binary file kinase_uniprot_description_lut.tabular.bz2 has changed
diff -r dbff53e6f75f -r 08678c931f5d macros.xml
--- a/macros.xml Mon Jul 11 19:22:25 2022 +0000
+++ b/macros.xml Fri Oct 28 18:27:21 2022 +0000
@@ -1,89 +1,47 @@
- 0.1.13
+ 0.1.15
0
bioconductor-preprocesscore
- numpy
+ gmp
+ numpy
openblas
- pandas
- perl-dbd-sqlite
- perl
- pyahocorasick
- python
- r-data.table
- r-dbi
- r-ggplot2
+ pandas
+ perl-dbd-sqlite
+ perl
+ pyahocorasick
+ python
+ r-base
+ r-caret
+ r-data.table
+ r-dbi
+ r-ggplot2
r-gplots
- r-latex2exp
- r-optparse
+ r-latex2exp
+ r-optparse
r-reshape2
- r-rmarkdown
- r-rsqlite
- r-sass
+ r-rmarkdown
+
+ r-sessioninfo
r-sqldf
- r-stringr
- r-tinytex
+ r-stringr
+ r-tinytex
r-vioplot
-
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.R
--- a/mqppep_anova.R Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova.R Fri Oct 28 18:27:21 2022 +0000
@@ -1,20 +1,15 @@
#!/usr/bin/env Rscript
# libraries
library(optparse)
-library(data.table)
library(stringr)
+library(tinytex)
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
# parse options
option_list <- list(
- make_option(
- c("-i", "--inputFile"),
- action = "store",
- default = NA,
- type = "character",
- help = "Phosphopeptide Intensities sparse input file path"
- ),
+
+ # files
make_option(
c("-a", "--alphaFile"),
action = "store",
@@ -24,64 +19,11 @@
" path to text file having one column and no header")
),
make_option(
- c("-S", "--preproc_sqlite"),
- action = "store",
- default = NA,
- type = "character",
- help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
- ),
- make_option(
- c("-K", "--ksea_sqlite"),
+ c("-M", "--anova_ksea_metadata"),
action = "store",
- default = NA,
- type = "character",
- help = "Path to 'ksea_sqlite' output produced by this tool"
- ),
- make_option(
- c("-f", "--firstDataColumn"),
- action = "store",
- default = "^Intensity[^_]",
- type = "character",
- help = "First column of intensity values"
- ),
- make_option(
- c("-m", "--imputationMethod"),
- action = "store",
- default = "random",
+ default = "anova_ksea_metadata.tsv",
type = "character",
- help = paste0("Method for missing-value imputation,",
- " one of c('group-median','median','mean','random')")
- ),
- make_option(
- c("-p", "--meanPercentile"),
- action = "store",
- default = 3,
- type = "integer",
- help = paste0("Mean percentile for randomly generated imputed values;",
- ", range [1,99]")
- ),
- make_option(
- c("-d", "--sdPercentile"),
- action = "store",
- default = 3,
- type = "double",
- help = paste0("Adjustment value for standard deviation of",
- " randomly generated imputed values; real")
- ),
- make_option(
- c("-s", "--regexSampleNames"),
- action = "store",
- default = "\\.(\\d+)[A-Z]$",
- type = "character",
- help = "Regular expression extracting sample-names"
- ),
- make_option(
- c("-g", "--regexSampleGrouping"),
- action = "store",
- default = "(\\d+)",
- type = "character",
- help = paste0("Regular expression extracting sample-group",
- " from an extracted sample-name")
+ help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
),
make_option(
c("-o", "--imputedDataFile"),
@@ -102,11 +44,56 @@
)
),
make_option(
+ c("-i", "--inputFile"),
+ action = "store",
+ default = NA,
+ type = "character",
+ help = "Phosphopeptide Intensities sparse input file path"
+ ),
+ make_option(
+ c("-K", "--ksea_sqlite"),
+ action = "store",
+ default = NA,
+ type = "character",
+ help = "Path to 'ksea_sqlite' output produced by this tool"
+ ),
+ make_option(
+ c("-S", "--preproc_sqlite"),
+ action = "store",
+ default = NA,
+ type = "character",
+ help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+ ),
+ make_option(
c("-r", "--reportFile"),
action = "store",
- default = "QuantDataProcessingScript.html",
+ default = "mqppep_anova.pdf",
+ type = "character",
+ help = "PDF report file path"
+ ),
+
+ # parameters
+ make_option(
+ c("-f", "--firstDataColumn"),
+ action = "store",
+ default = "^Intensity[^_]",
type = "character",
- help = "HTML report file path"
+ help = "First column of intensity values"
+ ),
+ make_option(
+ c("-m", "--imputationMethod"),
+ action = "store",
+ default = "random",
+ type = "character",
+ help = paste0("Method for missing-value imputation,",
+ " one of c('group-median','median','mean','random')")
+ ),
+ make_option(
+ c("-C", "--intensityMinValuesPerClass"),
+ action = "store",
+ default = "0",
+ type = "integer",
+ help = "Minimum number of observed values per class"
),
make_option(
c("-k", "--ksea_cutoff_statistic"),
@@ -124,14 +111,120 @@
help = paste0("Maximum score to be used to score a kinase enrichment as significant")
),
make_option(
- c("-M", "--anova_ksea_metadata"),
+ c("-c", "--kseaMinSubstrateCount"),
+ action = "store",
+ default = "1",
+ type = "integer",
+ help = "Minimum number of substrates to consider any kinase for KSEA"
+ ),
+ make_option(
+ c("--kseaUseAbsoluteLog2FC"),
+ action = "store_true",
+ default = "FALSE",
+ type = "logical",
+ help = paste0("Should abs(log2(fold-change)) be used for KSEA?",
+ " (TRUE may alter number of hits.)")
+ ),
+ make_option(
+ c("-p", "--meanPercentile"),
+ action = "store",
+ default = 3,
+ type = "integer",
+ help = paste0("Mean percentile for randomly generated imputed values;",
+ ", range [1,99]")
+ ),
+ make_option(
+ c("--minQuality"),
+ action = "store",
+ default = 0,
+ type = "integer",
+ help = paste0("Minimum quality (higher value reduces number of substrates",
+ " accepted; you may want to keep below 100), range [0,infinity]")
+ ),
+ make_option(
+ c("--oneWayManyCategories"),
+ action = "store",
+ default = "aov",
+ type = "character",
+ help = "Name of R function for one-way tests among more than two categories"
+ ),
+ make_option(
+ c("--oneWayTwoCategories"),
action = "store",
- default = "anova_ksea_metadata.tsv",
+ default = "two.way",
+ type = "character",
+ help = "Name of R function for one-way tests between two categories"
+ ),
+ make_option(
+ c("-s", "--regexSampleNames"),
+ action = "store",
+ default = "\\.(\\d+)[A-Z]$",
+ type = "character",
+ help = "Regular expression extracting sample-names"
+ ),
+ make_option(
+ c("-g", "--regexSampleGrouping"),
+ action = "store",
+ default = "(\\d+)",
type = "character",
- help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
+ help = paste0("Regular expression extracting sample-group",
+ " from an extracted sample-name")
+ ),
+ make_option(
+ c("-d", "--sdPercentile"),
+ action = "store",
+ default = 3,
+ type = "double",
+ help = paste0("Adjustment value for standard deviation of",
+ " randomly generated imputed values; real")
+ ),
+ make_option(
+ c("-F", "--sampleGroupFilter"),
+ action = "store",
+ default = "none",
+ type = "character",
+ help = paste0("Should no filter be applied to sample group names (none)",
+ " or should the filter specify samples to include or exclude?")
+ ),
+ make_option(
+ c("--sampleGroupFilterMode"),
+ action = "store",
+ default = "r",
+ type = "character",
+ help = paste0("First character ('f', 'p', or 'r') indicating regular",
+ "expression matching mode ('fixed', 'perl', or 'grep'; ",
+ "see https://rdrr.io/r/base/grep.html). Second character may be 'i;",
+ "to make search ignore case.")
+ ),
+ make_option(
+ c("-G", "--sampleGroupFilterPatterns"),
+ action = "store",
+ default = ".*",
+ type = "character",
+ help = paste0("Regular expression extracting sample-group",
+ " from an extracted sample-name")
)
)
-args <- parse_args(OptionParser(option_list = option_list))
+
+tryCatch(
+ args <- parse_args(
+ OptionParser(
+ option_list = option_list,
+ add_help_option = TRUE
+ ),
+ print_help_and_exit = TRUE
+ ),
+ error = function(e) {
+ parse_args(
+ OptionParser(
+ option_list = option_list,
+ add_help_option = TRUE
+ ),
+ print_help_and_exit = TRUE
+ )
+ stop(as.character(e))
+ }
+)
print("args is:")
cat(str(args))
@@ -140,16 +233,34 @@
if (! file.exists(args$inputFile)) {
stop((paste("Input file", args$inputFile, "does not exist")))
}
-input_file <- args$inputFile
-alpha_file <- args$alphaFile
-preproc_sqlite <- args$preproc_sqlite
-imputed_data_file_name <- args$imputedDataFile
-imp_qn_lt_data_filenm <- args$imputedQNLTDataFile
-anova_ksea_metadata <- args$anova_ksea_metadata
-report_file_name <- args$reportFile
-ksea_sqlite <- args$ksea_sqlite
-ksea_cutoff_statistic <- args$ksea_cutoff_statistic
-ksea_cutoff_threshold <- args$ksea_cutoff_threshold
+
+# files
+alpha_file <- args$alphaFile
+anova_ksea_metadata_file <- args$anova_ksea_metadata
+imp_qn_lt_data_file <- args$imputedQNLTDataFile
+imputed_data_file <- args$imputedDataFile
+input_file <- args$inputFile
+ksea_sqlite_file <- args$ksea_sqlite
+preproc_sqlite_file <- args$preproc_sqlite
+report_file_name <- args$reportFile
+
+# parameters
+# firstDataColumn - see below
+group_filter <- args$sampleGroupFilter
+group_filter_mode <- args$sampleGroupFilterMode
+# imputationMethod - see below
+intensity_min_values_per_class <- args$intensityMinValuesPerClass
+ksea_cutoff_statistic <- args$ksea_cutoff_statistic
+ksea_cutoff_threshold <- args$ksea_cutoff_threshold
+ksea_min_substrate_count <- args$kseaMinSubstrateCount
+ksea_use_absolute_log2_fc <- args$kseaUseAbsoluteLog2FC
+# mean_percentile - see below
+min_quality <- args$minQuality
+# regexSampleNames - see below
+# regexSampleGrouping - see below
+# sampleGroupFilterPatterns - see below (becomes group_filter_patterns)
+# sd_percentile - see below
+
if (
sum(
grepl(
@@ -192,32 +303,59 @@
# - regexSampleNames
# - regexSampleGrouping
read_config_file_string <- function(fname, limit) {
+ cat(sprintf("read_config_file_string: fname = '%s'\n", fname))
+ cat(sprintf("length(fname) = '%s'\n", length(fname)))
+ result <-
+ if (file.exists(fname)) {
+ cat(sprintf("reading '%s' ...\n", fname))
+ readChar(fname, limit)
+ } else {
+ cat(sprintf("not a file: '%s'\n", fname))
+ fname
+ }
+ #AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'"))
# eliminate any leading whitespace
- result <- gsub("^[ \t\n]*", "", readChar(fname, limit))
+ result <- gsub("^[ \t\n]*", "", result)
# eliminate any trailing whitespace
- result <- gsub("[ \t\n]*$", "", result)
+ result <- gsub("[ \t\n]*$", "", result)
# substitute characters escaped by Galaxy sanitizer
- result <- gsub("__lt__", "<", result)
- result <- gsub("__le__", "<=", result)
- result <- gsub("__eq__", "==", result)
- result <- gsub("__ne__", "!=", result)
- result <- gsub("__gt__", ">", result)
- result <- gsub("__ge__", ">=", result)
- result <- gsub("__sq__", "'", result)
- result <- gsub("__dq__", '"', result)
- result <- gsub("__ob__", "[", result)
- result <- gsub("__cb__", "]", result)
+ result <- gsub("__lt__", "<", result)
+ result <- gsub("__le__", "<=", result)
+ result <- gsub("__eq__", "==", result)
+ result <- gsub("__ne__", "!=", result)
+ result <- gsub("__gt__", ">", result)
+ result <- gsub("__ge__", ">=", result)
+ result <- gsub("__sq__", "'", result)
+ result <- gsub("__dq__", '"', result)
+ result <- gsub("__ob__", "[", result)
+ result <- gsub("__cb__", "]", result)
}
+nc <- 1000
+
+sink(stderr())
+
cat(paste0("first_data_column file: ", args$firstDataColumn, "\n"))
-cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
-cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
-nc <- 1000
-regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
-regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
first_data_column <- read_config_file_string(args$firstDataColumn, nc)
cat(paste0("first_data_column: ", first_data_column, "\n"))
+
+cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
+regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
+
+cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
cat(paste0("regex_sample_names: ", regex_sample_names, "\n"))
-cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))
+
+if (group_filter != "none") {
+ cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n"))
+ group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)
+} else {
+ group_filter_patterns <- ".*"
+}
+cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n"))
+
+sink()
+
# from: https://github.com/molgenis/molgenis-pipelines/wiki/
# How-to-source-another_file.R-from-within-your-R-script
@@ -253,45 +391,72 @@
return(NULL)
}
-script_dir <- location_of_this_script()
+# validation of input parameters is complete; it is now justifiable to
+# install LaTeX tools to render markdown as PDF; this involves a big
+# download from GitHub
+if (!tinytex::is_tinytex()) tinytex::install_tinytex()
rmarkdown_params <- list(
- inputFile = input_file
- , alphaFile = alpha_file
- , preprocDb = preproc_sqlite
+
+ # files
+ alphaFile = alpha_file
+ , anovaKseaMetadata = anova_ksea_metadata_file
+ , imputedDataFilename = imputed_data_file
+ , imputedQNLTDataFile = imp_qn_lt_data_file
+ , inputFile = input_file
+ , kseaAppPrepDb = ksea_sqlite_file
+ , preprocDb = preproc_sqlite_file
+
+ # parameters
, firstDataColumn = first_data_column
+ , groupFilter = group_filter
+ , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode
+ , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns
, imputationMethod = imputation_method
+ , intensityMinValuesPerGroup = intensity_min_values_per_class
+ , kseaCutoffStatistic = ksea_cutoff_statistic
+ , kseaCutoffThreshold = ksea_cutoff_threshold
+ , kseaMinSubstrateCount = ksea_min_substrate_count
+ , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add
, meanPercentile = mean_percentile
- , sdPercentile = sd_percentile
+ , minQuality = min_quality # add
+ , regexSampleGrouping = regex_sample_grouping
, regexSampleNames = regex_sample_names
- , regexSampleGrouping = regex_sample_grouping
- , imputedDataFilename = imputed_data_file_name
- , imputedQNLTDataFile = imp_qn_lt_data_filenm
- , anovaKseaMetadata = anova_ksea_metadata
- , kseaAppPrepDb = ksea_sqlite
- , kseaCutoffThreshold = ksea_cutoff_threshold
- , kseaCutoffStatistic = ksea_cutoff_statistic
+ , sdPercentile = sd_percentile
)
print("rmarkdown_params")
-str(rmarkdown_params)
+print(rmarkdown_params)
+print(
+ lapply(
+ X = rmarkdown_params,
+ FUN = function(x) {
+ paste0(
+ nchar(as.character(x)),
+ ": '",
+ as.character(x),
+ "'"
+ )
+ }
+ )
+)
+
# freeze the random number generator so the same results will be produced
# from run to run
set.seed(28571)
-# BUG (or "opportunity")
-# To render as PDF for the time being requires installing the conda
-# package `r-texlive` until this issue in `texlive-core` is resolved:
-# https://github.com/conda-forge/texlive-core-feedstock/issues/19
-# This workaround is detailed in the fourth comment of:
-# https://github.com/conda-forge/texlive-core-feedstock/issues/61
+script_dir <- location_of_this_script()
-library(tinytex)
-tinytex::install_tinytex()
rmarkdown::render(
input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
-, output_format = rmarkdown::pdf_document(toc = TRUE)
, output_file = report_file_name
, params = rmarkdown_params
+, output_format = rmarkdown::pdf_document(
+ includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")
+ , dev = "pdf"
+ , toc = TRUE
+ , toc_depth = 2
+ , number_sections = FALSE
+ )
)
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.xml
--- a/mqppep_anova.xml Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova.xml Fri Oct 28 18:27:21 2022 +0000
@@ -7,6 +7,28 @@
Runs ANOVA and KSEA for phosphopeptides.
macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
topic_0121
@@ -27,29 +49,58 @@
both need access to a writeable directory, but most directories in a
biocontainer are read-only, so this builds a pseudo-home under /tmp
-->
+
+
+
+
+
+
+
+
+
+
+
$sample_names_regex
@@ -57,18 +108,39 @@
$sample_grouping_regex
+
+ #if $group_filter.group_filter_method != "none"
+ $group_filter.group_filter_patterns
+ #end if
+
$intensity_column_regex
-
+
-
-
@@ -93,15 +165,15 @@
label="Mean percentile for random values"
help="Percentile center of random values; range [1,99]"
/>
-
+ help="Pattern extracting sample-names from names of columns of 'Filtered phosphopeptide intensities' that have peptide intensity data (PERL-compatible regular expression)"
+ label="Sample-name extraction pattern">
@@ -109,32 +181,65 @@
+ help="Pattern extracting sample-group from the extracted sample-names (PERL-compatible regular expression)"
+ label="Sample-group extraction pattern">
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
-
+
@@ -156,20 +261,32 @@
-
+
-
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
-
+
@@ -237,8 +366,8 @@
-
-
+
+
@@ -248,14 +377,34 @@
Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA
====================================================
-**Input files**
+**Overview**
+============
+
+Perform statistical analysis of preprocessed MaxQuant output data collected as described in `[Cheng, 2018] `_.
+
+ - Extracts sample-group IDs from sample names.
+ - Imputes missing values.
+ - Performs ANOVA analysis for each phosphopeptide.
+ - Performs Kinase-Substrate Enrichment Analysis (KSEA) using the method described by `Casado et al. (2013) `_; see *"Algorithms"* section below.
-``Filtered Phosphopeptide Intensities``
+**Workflow position**
+=====================
+
+Upstream tool
+ The "MaxQuant Phosphopeptide Preprocessing" tool (``mqppep_preproc``) that transforms MaxQuant output for phospoproteome-enriched samples into a form suitable for statistical analysis.
+
+**Input datasets**
+==================
+
+``Filtered phosphopeptide intensities`` (tabular)
Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
- This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter"
- (``mqppep_mrgflt``) tool.
+ This is the output from the "MaxQuant Phopsphopeptide Preprocessing"
+ (``mqppep_preproc``) tool.
-``ANOVA alpha cutoff level``
+ - First column label 'Phosphopeptide'.
+ - Sample-intensities must begin in first column matching 'Intensity-column pattern' and must have column labels to match argument 'Sample-name extraction pattern'.
+
+``ANOVA alpha cutoff level`` (tabular)
List of alpha cutoff values for significance testing; text file having one column and no header. For example:
::
@@ -264,60 +413,99 @@
0.1
0.05
+``Database from mqppep_preproc`` (sqlite)
+ SQLite database produced by the "MaxQuant Phopsphopeptide Preprocessing"
+ (``mqppep_preproc``) tool.
+
**Input parameters**
+====================
``Intensity-column pattern``
- First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity**
+ First column of ``Filtered phosphopeptide intensities`` having intensity values (integer or PERL-compatible regular expression matching column label). Default::
+
+ ^Intensity[^_]
``Imputation method``
Impute missing values by:
- 1. ``group-median`` - use median for each sample-group;
- 2. ``mean`` - use mean across all samples; or
- 3. ``median`` - use median across all samples;
- 4. ``random`` - use randomly generated values where:
+ 1. ``group-median`` - use median for each sample-group;
+ 2. ``mean`` - use mean across all samples; or
+ 3. ``median`` - use median across all samples;
+ 4. ``random`` - use randomly generated values where:
- - ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and
- - ``Percentile std. dev. for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.
+ (i) ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and
+ (ii) ``Percentile SD for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.
-``Sample-extraction pattern``
- PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample.
+``Sample-name extraction pattern``
+ PERL-compatible regular expression extracting the sample-name from the the name of a column of intensities (from ``Filtered phosphopeptide intensities``) for one sample.
- - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A``
+ - For example, ``"\.\d+[A-Z]$"`` applied to "``Intensity.splunge.10A``" would produce "``.10A``".
- Note that *this is case sensitive* by default.
-``Group-extraction pattern``
- PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensites (from ``input_file``).
+``Sample-group extraction pattern``
+ PERL-compatible regular expression extracting the sample-grouping from the sample-name (that was in turn extracted with ``Sample-name extraction pattern`` from a column of intensites from ``Filtered phosphopeptide intensities``).
+
+ - For example, ``"\d+$"`` applied to "``.10A``" would produce "``10``".
+ - Note that *this is case sensitive* by default.
+
+``Minimum number of values per sample-group``
+ Sometimes you may wish to filter out the intensities that are poorly represented among some sample groups because they complicate the comparison process. You can use this parameter to specify the minimum number of values in any sample-group (range [0,]]>∞∞`_.
+The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." `_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches.
**Authors**
+===========
``Larry C. Cheng``
(`ORCiD 0000-0002-6922-6433 `_) wrote the original script.
@@ -337,5 +525,11 @@
10.3791/57996
10.1093/bioinformatics/btx415
+ @Manual{,
+ title = {KSEAapp: Kinase-Substrate Enrichment Analysis},
+ author = {Danica D. Wiredja},
+ year = {2017},
+ note = {R package version 0.99.0},
+ }
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_preamble.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_preamble.tex Fri Oct 28 18:27:21 2022 +0000
@@ -0,0 +1,90 @@
+% -----------------------------------------------------------------------------
+% preamble includes BEGIN
+% -----------------------------------------------------------------------------
+\usepackage{longtable, lscape, ifthen}
+
+% -----------------------------------------------------------------------------
+% put \T or \B at the ends of lines to add space for super- or sub-
+% scripts above or below, respectively
+% ref: ?
+\newcommand\T{\rule{0pt}{2.6ex}} % Top strut
+\newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+
+% -----------------------------------------------------------------------------
+% horizontal line commands; ideally, these would compute the width rather than
+% hardcoding it
+% ref: ?
+\def\hlinport{\makebox[6.5in]{\hrulefill} \\} % hline outside tabular, port
+\def\hlinlscp{\makebox[9in]{\hrulefill} \\} % hline outside tabular, lndscp
+%ref: https://stackoverflow.com/a/67335722
+\def\hlinnotab{\\makebox[1.0\linewidth]{\hrulefill}\\[1ex]}
+
+% -----------------------------------------------------------------------------
+% ref: https://latex.org/forum/viewtopic.php?p=23257#p23257
+\newcommand{\nonemptyline}[1]{%
+ %\ifthenelse{\equal{#1}{}}{do when empty}{do when not empty}
+ \ifthenelse{\equal{#1}{}}{}{#1}%
+}
+
+% -----------------------------------------------------------------------------
+% For RMarkdown, I needed to put this into a preamble.tex file and include it
+% via `output: pdf_document: includes: in_header: preamble.tex` because
+% Markdown was expanding the \tabfill command before writing the tex file
+% ref: https://tex.stackexchange.com/a/119477 in reply to
+% https://tex.stackexchange.com/questions/119473/tabbing-and-line-wrapping
+\makeatletter
+\newlength\tdima
+\newcommand\tabfill[1]{\setlength\tdima{\linewidth}%
+ \addtolength\tdima{\@totalleftmargin}%
+ \addtolength\tdima{-\dimen\@curtab}%
+ \parbox[t]{\tdima}{#1\ifhmode\strut\fi}}
+ %\parbox[t]{\tdima}{\nonemptyline{#1}\ifhmode\strut\fi}}
+\makeatother
+%
+% Create a tabbing environment in which to use tabfill
+% param #1 is specified the tabstops (as expected by the tabbing
+% environment) and is provided in braces after invocation, e.g.:
+% \begin{tabwrap}{\hspace{1.25in}\=}
+% param #2 is the contents of the envirnent
+\newenvironment{tabwrap}[2]{%
+ \begin{tabbing}#1\kill\ignorespaces%
+ #2}%
+ {\end{tabbing}%
+}
+
+% -----------------------------------------------------------------------------
+% Make a caption for a non-floating figure or table, e.g.,
+% ref: https://github.com/rf-latex/capt-of/blob/main/capt-of.dtx
+% https://texfaq.org/FAQ-figurehere
+%
+% Usage: \captionof{*type*}[*move*]{*caption*}
+% *type* is `figure` or `table` (or some type you've
+% defined with the`float` package)
+% *move* is the optional moving argument *caption* (the thing
+% that goes to the list of tables/figures)
+% *caption* is the text of the caption
+\makeatletter
+\newcommand\captionof[1]{\def\@captype{#1}\caption}
+\makeatother
+%
+%%ACE \captionof{table}{Hello world from line 210}
+% To circumvent mis-numbering of interleaved float and non-float table
+% and figure captions, it is necessary to include the `perpage` package and
+% "make them sorted" (FFI see https://texfaq.org/FAQ-figurehere)
+% I (ACE) don't know how to get this package to include:
+% \usepackage{bigfoot}
+% so I included the source instead:
+\makeatletter
+\input{perpage.tex}
+\makeatother
+%
+% Ensure that table numbers are sorted
+\MakeSorted{table}
+% Ensure that figure numbers are sorted
+\MakeSorted{figure}
+
+% -----------------------------------------------------------------------------
+
+% -----------------------------------------------------------------------------
+% preamble includes END
+% -----------------------------------------------------------------------------
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_script.Rmd
--- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova_script.Rmd Fri Oct 28 18:27:21 2022 +0000
@@ -7,81 +7,153 @@
date:
- "May 28, 2018"
- "; revised June 23, 2022"
+lot: true
output:
pdf_document:
toc: true
- toc_depth: 3
+ toc_depth: 2
keep_tex: true
-header-includes:
- - \usepackage{longtable}
- - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut
- - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+ dev: pdf
+ includes:
+ in_header: mqppep_anova_preamble.tex
+latex_macros: false
+raw_tex: true
+urlcolor: blue
params:
alphaFile: "test-data/alpha_levels.tabular"
inputFile: "test-data/test_input_for_anova.tabular"
preprocDb: "test-data/test_input_for_anova.sqlite"
kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
- show_toc: true
- firstDataColumn: "^Intensity[^_]"
- imputationMethod: !r c("group-median", "median", "mean", "random")[1]
- meanPercentile: 1
- sdPercentile: 1.0
regexSampleNames: "\\.\\d+[A-Z]$"
regexSampleGrouping: "\\d+"
+ groupFilterPatterns: ".+"
+ groupFilter: !r c("none", "exclude", "include")[1]
+ imputationMethod: !r c("group-median", "median", "mean", "random")[4]
+ kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]
+ #imputationMethod: !r c("group-median", "median", "mean", "random")[1]
+
+ # how should sample groups be interpreted?
+ # - "f": fixed patterns (like `grep -F`)
+ # - "p": PERL-compatible (like `grep -P`)
+ # - "r": extended grep patterns (like `grep -E`)
+ # use what case sensitivity?
+ # - "i": case insensitive matching (like `grep -i`)
+ groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]
+ # what pattern should be used for the first column
+ # (extended grep pattern, case sensitive)
+ firstDataColumn: "^Intensity[^_]"
+ # for small random value imputation, what percentile should be center?
+ meanPercentile: 50
+ #meanPercentile: 1
+ # for small random value imputation, what should `s / mean(x)` ratio be?
+ sdPercentile: 1.0
+ # output path for imputed data file
imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
+ # output path for imputed/quantile-normalized/log-transformed data file
imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
+ # output path for contents of `stats_metadata_v` table
anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
+ # how to test one variable with > 2 categories (e.g., aov or kruskal.test)
oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
+ # how to test one variable with 2 categories (e.g., oneway.test)
oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
- kseaCutoffStatistic: !r c("p.value", "FDR")[2]
- kseaCutoffThreshold: !r c( 0.1, 0.05)[2]
- kseaMinKinaseCount: 1
- intensityHeatmapRows: 75
+ # what should be the minimum quality for consideration in both
+ minQuality: 0
+ # correct KSEA with FDR (recommended) or raw p-value
+ kseaCutoffStatistic: !r c("FDR", "p.value")[1]
+ # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)
+ # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test
+ #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1]
+ # minimum number of substrates required for a kinase to be considered in KSEA
+ kseaMinSubstrateCount: 1
+ # Should KSEA be performed aggregating signed log2FC or absolute?
+ # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
+ # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a
+ # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
+ kseaUseAbsoluteLog2FC: TRUE
+ #kseaUseAbsoluteLog2FC: FALSE
+ # minimum number of observed values per sample-group
+ intensityMinValuesPerGroup: 1
+ # maximum number of heatmap rows (result are poor when > 50)
+ intensityHeatmapRows: 50
+ # what should be the primary criterion to eliminate excessive heatmap rows
+ intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
+ # should correlation among substrates be used (rather than covariance)
+ correlateSubstrates: TRUE
+ # only show covariance among variables having variance > 1
+ filterCovVarGT1: TRUE
+ # maximum number of residues to display for ppeps in rownames or columnames
+ ppepTruncN: 10
+ # maximum number of characters of subgenes to display in rownames or columnames
+ subgeneTruncN: 10
+ # maximum number of characters for paste(subgene, ppep) for enrichment plots
+ substTruncN: 20
+ # should boxplots use variable-width boxes to reflect # of samples
+ boxPlotVarWidth: TRUE
+ # should boxplots use notched boxes to reflect difference between samples
+ boxPlotNotch: TRUE
+ # look-up tables for kinase descriptions
+ kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
+ kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
+ # should debugging trace messages be printed?
+ showEnrichedSubstrates: FALSE
+
+ # should debugging nb/nbe messages be printed?
+ printNBMsgs: FALSE
+ # should debugging trace messages be printed?
+ printTraceMsgs: FALSE
+ # when debugging files are needed, set debugFileBasePath to the path
+ # to the directory where they should be writtn
+ debugFileBasePath: !r if (TRUE) NULL else "test-data"
---
-
-```{r setup, include = FALSE}
+
+```{r setup, include = FALSE, results = 'asis'}
+
+# simple debug messaging
+print_nb_messages <- params$printNBMsgs
+
+nb <- if (!print_nb_messages) {
+ function(...) invisible()
+ } else {
+ function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...)
+ }
+
+nbe <- if (!print_nb_messages) {
+ function(...) invisible()
+ } else {
+ function(..., f = cat, file = stderr()) {
+ cat(
+ stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"),
+ ...,
+ file = file
+ )
+ }
+ }
+
#ref for debugging: https://yihui.org/tinytex/r/#debugging
options(tinytex.verbose = TRUE)
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
-knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
+knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300)
# freeze the random number generator so the same results will be produced
# from run to run
set.seed(28571)
### LIBRARIES
+
+if (print_nb_messages) nbe("library(gplots)")
library(gplots)
+if (print_nb_messages) nbe("library(caret)")
+# load caret for nearZeroVar
+if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n")
+library(caret)
+if (print_nb_messages) nbe("library(DBI)")
library(DBI)
+if (print_nb_messages) nbe("library(RSQLite)")
library(RSQLite)
+if (print_nb_messages) nbe("library(sqldf)\n")
# Suppress "Warning: no DISPLAY variable so Tk is not available"
suppressWarnings(suppressMessages(library(sqldf)))
@@ -97,30 +169,42 @@
### CONSTANTS
-const_parfin <- par("fin")
-const_boxplot_fill <- "grey94"
-const_stripchart_cex <- 0.5
-const_stripsmall_cex <-
- sqrt(const_stripchart_cex * const_stripchart_cex / 2)
-const_stripchart_jitter <- 0.3
-const_write_debug_files <- FALSE
-const_table_anchor_bp <- "bp"
-const_table_anchor_ht <- "ht"
-const_table_anchor_p <- "p"
-const_table_anchor_tbp <- "tbp"
-
-
+const_boxplot_fill <- "grey94"
const_ksea_astrsk_kinases <- 1
const_ksea_nonastrsk_kinases <- 2
const_ksea_all_kinases <- 3
-
-const_log10_e <- log10(exp(1))
-
-### FUNCTIONS
-
-# from `demo(error.catching)`
+const_log10_e <- log10(exp(1))
+const_stripchart_cex <- 0.5
+const_stripchart_jitter <- 0.3
+const_table_anchor_bp <- "bp"
+const_table_anchor_ht <- "ht"
+const_table_anchor_p <- "p"
+const_table_anchor_t <- "t"
+const_table_anchor_tbp <- "tbp"
+
+
+### GLOBAL VARIABLES (params)
+
+## functions to process params
+
+is_string_null_or_empty <- function(x) {
+ # N. B. non-strings are intentionally treated as NULL
+ if (is.null(x))
+ TRUE
+ else if (!is.character(x))
+ TRUE
+ else x == ""
+}
+
##' Catch *and* save both errors and warnings, and in the case of
##' a warning, also keep the computed result.
+##' return result as list(value = ..., warning = ...)
+##' - value will be:
+##' - the result if no exception is thrown
+##' - the exception if an exception is caught
+##' - warning will be a string except perhaps when warning argument is not NULL
+##'
+##' adapted from `demo(error.catching)`
##'
##' @title tryCatch both warnings (with value) and errors
##' @param expr an \R expression to evaluate
@@ -128,32 +212,428 @@
##' 'value' may be an error caught.
##' @author Martin Maechler;
##' Copyright (C) 2010-2012 The R Core Team
-try_catch_w_e <- function(expr) {
- wrn <- NULL
- # warning handler
- w_handler <- function(w) {
- wrn <<- w
- invokeRestart("muffleWarning")
+try_catch_w_e <-
+ function(expr, error = function(e) e, warning = NULL) {
+ wrn <- NULL
+ # warning handler
+ w_handler <-
+ if (is.function(warning))
+ warning
+ else
+ function(w) {
+ wrn <<- w
+ invokeRestart("muffleWarning")
+ }
+ e_handler <-
+ if (is.function(error))
+ error
+ else
+ function(e) e
+ # return result as list(value = ..., warning = ...)
+ # - value will be:
+ # - the result if no exception is thrown
+ # - the exception if an exception is caught
+ list(
+ value = withCallingHandlers(
+ tryCatch(
+ expr,
+ error = e_handler
+ ),
+ warning = w_handler
+ ),
+ warning = wrn
+ )
+ }
+
+see_kvp <-
+ function(format, key, value, suffix = "") {
+ if (
+ !all(
+ is.character(format),
+ is.character(key),
+ is.character(value),
+ is.character(suffix)
+ )
+ ) {
+ cat("all arguments to see_kvp should be character")
+ knitr::knit_exit()
+ }
+ result <- sprintf(format, value)
+ if (length(result) > 1) {
+ sprintf(
+ "%s = c(%s)%s",
+ whack_underscores(key),
+ paste(result, collapse = ", "),
+ suffix
+ )
+ } else {
+ sprintf(
+ "%s = %s%s",
+ key,
+ result,
+ suffix
+ )
+ }
+ }
+
+see_logical <-
+ function(x, suffix = "", xprssn = deparse1(substitute(x))) {
+ result <- as.character(as.logical(x))
+ # handle NAs and NaNs
+ result[is.na(result)] <- "NA"
+ see_kvp(
+ format = "%s",
+ key = xprssn,
+ value = result,
+ suffix = suffix
+ )
+ }
+
+see_numeric <-
+ function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
+ if (is.numeric(digits) && is.numeric(x)) {
+ digits <- as.integer(digits)
+ digits <- min(16, max(0, digits))
+ format <- paste0("%0.", as.character(digits), "g")
+ result <- sprintf(format, x)
+ see_kvp(
+ format = "%s",
+ key = xprssn,
+ value = result,
+ suffix = suffix
+ )
+ }
+ }
+
+see_character <-
+ function(x, suffix = "", xprssn = deparse1(substitute(x))) {
+ if (is.character(x)) {
+ see_kvp(
+ format = "%s",
+ key = xprssn,
+ value = sprintf("\"%s\"", x),
+ suffix = suffix
+ )
+ }
+ }
+
+see_variable <-
+ function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) {
+ if (is.character(x)) {
+ see_character(x, suffix, xprssn)
+ } else if (is.numeric(x)) {
+ see_numeric(x, suffix, digits, xprssn)
+ } else if (is.logical(x)) {
+ see_logical(x, suffix, xprssn)
+ } else {
+ f <- file("")
+ sink(f)
+ str(x)
+ msg <- paste(readLines(f), collapse = "\n")
+ sink()
+ close(f)
+ paste0(
+ "see_variable - str(",
+ xprssn,
+ "):\n",
+ msg, "\n"
+ )
+ }
+ }
+
+# ref: https://tug.org/texinfohtml/latex2e.html
+# LaTeX sets aside the following characters for special purposes.
+# For example, the percent sign % is for comments.
+# They are called reserved characters or special characters.
+# They are all discussed elsewhere in this manual.
+#
+# $ % & { } _ ~ ^ \ #
+#
+# If you want a reserved character to be printed as itself, in the text body
+# font, for all but the final three characters in that list simply put
+# a backslash \ in front of the character.
+# Thus, typing \$1.23 will produce $1.23 in your output.
+#
+# As to the last three characters, to get a tilde in the text body font,
+# use \~{} (omitting the curly braces would result in the next character
+# receiving a tilde accent).
+# Similarly, to get a text body font circumflex use \^{}.
+# To get a backslash in the font of the text body enter \textbackslash{}.
+whack_math <-
+ function(v) {
+ v <- as.character(v)
+ w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE)
+ w <- Reduce(
+ f = function(l, r) {
+ gsub(r, paste0("\\", r), l, fixed = TRUE)
+ },
+ x = c("#", "$", "%", "&", "{", "}", "_"),
+ init = w
+ )
+ w <- gsub("^", "\\^{}", w, fixed = TRUE)
+ return(w)
+ if (all(v == w))
+ v
+ else
+ paste0("\\texttt{", w, "}")
}
- list(
- value = withCallingHandlers(
- tryCatch(
- expr,
- error = function(e) e
- ),
- warning = w_handler
- ),
- warning = wrn
+whack_underscores <- whack_math
+
+## dump params to stderr (remove this eventually)
+
+if (FALSE) nbe(see_variable(params))
+
+## unlist params for eventual output
+
+param_unlist <- unlist(as.list(params))
+
+# no need to whack underscores and dollars because this is verbatim
+param_df <- data.frame(
+ parameter = paste0("\\verb@", names(param_unlist), "@"),
+ value = paste0(
+ "\n\\begin{tiny}\n\\verb@",
+ param_unlist,
+ "@\n\\end{tiny}"
+ )
+ )
+param_df <- data.frame(
+ parameter = names(param_unlist),
+ value = param_unlist
+ )
+param_df <- param_df[order(param_df$parameter), ]
+
+## general output control
+
+debug_file_base_path <- params$debugFileBasePath
+print_trace_messages <- params$printTraceMsgs
+show_enriched_substrates <- params$showEnrichedSubstrates
+boxplot_varwidth <- params$boxPlotVarWidth
+boxplot_notch <- params$boxPlotNotch
+
+## parameters for static data
+
+kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2
+kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2
+
+## parameters for input file
+
+preproc_db <- params$preprocDb
+alpha_file <- params$alphaFile
+input_file <- params$inputFile
+
+# First data column - ideally, this could be detected via
+# regexSampleNames, but for now leave it as is.
+first_data_column <- params$firstDataColumn
+fdc_is_integer <- is.integer(first_data_column)
+if (fdc_is_integer) {
+ first_data_column <- as.integer(params$firstDataColumn)
+}
+
+## parameters for output files
+
+ksea_app_prep_db <- params$kseaAppPrepDb
+imputed_data_filename <- params$imputedDataFilename
+imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
+anova_ksea_mtdt_file <- params$anovaKseaMetadata
+
+## parameters for imputation
+
+# Imputation method, should be one of
+# "random", "group-median", "median", or "mean"
+imputation_method <- params$imputationMethod
+
+# Selection of percentile of logvalue data to set the mean for random number
+# generation when using random imputation
+mean_percentile <- params$meanPercentile / 100.0
+
+# deviation adjustment-factor for random values; real number.
+sd_percentile <- params$sdPercentile
+
+## parameters for group parsing and filtering
+
+# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
+regex_sample_names <- params$regexSampleNames
+# Regular expression to extract Sample Grouping from Sample Name;
+# if error occurs, compare smpl_trt vs. sample_name_matches
+# to see if groupings/pairs line up
+# e.g., "(\\d+)"
+
+regex_sample_grouping <- params$regexSampleGrouping
+# What are the patterns for filtering sample groups?
+# How should sample groups be filtered?
+# - none: do not filter
+# - include: include sample groups matching filter
+# - exclude: include sample groups not matching filter
+
+sample_group_filter <- params$groupFilter
+if (grepl("f", params$groupFilterMode, fixed = TRUE)) {
+ sample_group_filter_perl <- FALSE
+ sample_group_filter_fixed <- TRUE
+} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) {
+ sample_group_filter_perl <- TRUE
+ sample_group_filter_fixed <- FALSE
+} else { # normal regex
+ sample_group_filter_perl <- FALSE
+ sample_group_filter_fixed <- FALSE
+}
+
+sample_group_filter_nocase <-
+ grepl("i", params$groupFilterMode, fixed = TRUE)
+
+# What PCRE patterns should be included or excluded
+group_filter_patterns_csv <- params$groupFilterPatterns
+sample_group_filter_patterns <- strsplit(
+ x = group_filter_patterns_csv,
+ split = ",",
+ fixed = TRUE
+ )[[1]]
+
+## parameters for hypothesis testing
+
+one_way_all_categories_fname <- params$oneWayManyCategories
+
+one_way_all_categories <- try_catch_w_e(
+ match.fun(one_way_all_categories_fname))
+
+if (!is.function(one_way_all_categories$value)) {
+ write("fatal error for parameter oneWayManyCategories:", stderr())
+ write(one_way_all_categories$value$message, stderr())
+ if (sys.nframe() > 0) {
+ cat("Cannot continue and quit() failed. Goodbye.")
+ knitr::knit_exit()
+ quit(save = "no", status = 1)
+ }
+}
+
+one_way_all_categories <- one_way_all_categories$value
+
+one_way_two_categories_fname <- params$oneWayManyCategories
+one_way_two_categories <- try_catch_w_e(
+ match.fun(one_way_two_categories_fname))
+if (!is.function(one_way_two_categories$value)) {
+ cat("fatal error for parameter oneWayTwoCategories: \n")
+ cat(one_way_two_categories$value$message, fill = TRUE)
+ if (sys.nframe() > 0) {
+ cat("Cannot continue and quit() failed. Goodbye.")
+ knitr::knit_exit()
+ quit(save = "no", status = 1)
+ }
+}
+one_way_two_categories <- one_way_two_categories$value
+
+## parameters for KSEA
+
+ksea_cutoff_statistic <- params$kseaCutoffStatistic
+ksea_cutoff_threshold <- params$kseaCutoffThreshold
+ksea_min_substrate_count <- params$kseaMinSubstrateCount
+
+## parameters for global variables consumed by functions
+
+# intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1
+# TODO Validate within list
+g_intensity_hm_criteria <- params$intensityHeatmapCriteria
+if (is_string_null_or_empty(g_intensity_hm_criteria)) {
+ cat("invalid intensityHeatmapCriteria parameter (must be string)")
+ knitr::knit_exit()
+}
+switch(
+ g_intensity_hm_criteria,
+ "quality" = NULL,
+ "na_count" = NULL,
+ "p_value" = NULL,
+ {
+ with(
+ params,
+ cat(
+ sprintf(
+ "invalid %s (must be %s)",
+ see_variable(intensityHeatmapCriteria),
+ "one of quality or na_count or p_value"
+ )
+ )
+ )
+ knitr::knit_exit()
+ }
+)
+
+# intensityHeatmapRows: 50
+# TODO Validate >> 0 < 75
+g_intensity_hm_rows <- params$intensityHeatmapRows
+if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) {
+ cat("invalid intensityHeatmapRows (must be integer > 0)")
+ knitr::knit_exit()
+}
+
+g_intensity_min_per_class <- params$intensityMinValuesPerGroup
+if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) {
+ cat("invalid intensityMinValuesPerGroup (must be integer > -1")
+ knitr::knit_exit()
+}
+
+if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) {
+ cat("invalid correlateSubstrates (must be TRUE or FALSE)")
+ knitr::knit_exit()
+}
+
+if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) {
+ cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)")
+ knitr::knit_exit()
+}
+
+# TODO Validate >> 0 < 30
+g_ppep_trunc_n <- params$ppepTruncN
+
+# TODO Validate >> 0 < 30
+g_subgene_trunc_n <- params$subgeneTruncN
+
+# TODO Validate >> 0 < 30
+g_sbstr_trunc_n <- params$substTruncN
+
+
+### OPERATORS
+
+# Test for exclusion
+# ref: https://www.reneshbedre.com/blog/in-operator-r.html
+`%notin%` <- Negate(`%in%`)
+
+# Augmented assignment
+# ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign
+`%||:=%` <- function(lvalue, ...) {
+ pf <- parent.frame()
+ rvalue <- Reduce(paste0, x = ..., init = lvalue)
+ assign(
+ x = as.character(substitute(lvalue)),
+ value = rvalue,
+ pos = pf
+ )
+ invisible(rvalue)
+}
+
+### FUNCTIONS
+
+no_op <-
+ function() {
+ }
+# this function is not used in this file and should be removed while
+# factoring out reusable code
+all_apply <- function(f, v, na_rm = TRUE, ...) {
+ Reduce(
+ f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r,
+ x = sapply(X = v, FUN = f, ...),
+ init = TRUE
)
}
-
-write_debug_file <- function(s) {
- if (const_write_debug_files) {
- s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
- print(sprintf("DEBUG writing file %s", spath))
+write_debug_file <- function(data_frame) {
+ if (!is.null(debug_file_base_path)) {
+ s_path <-
+ sprintf(
+ "%s/%s.txt",
+ debug_file_base_path,
+ deparse(substitute(data_frame))
+ )
write.table(
- s,
+ data_frame,
file = s_path,
sep = "\t",
col.names = TRUE,
@@ -174,6 +654,137 @@
new.env(parent = emptyenv())
}
+# make apply readable for rows
+row_apply <- function(x, fun, ..., simplify = TRUE) {
+ apply(x, MARGIN = 1, fun, ..., simplify = TRUE)
+}
+
+# make apply readable for columns
+column_apply <- function(x, fun, ..., simplify = TRUE) {
+ apply(x, MARGIN = 2, fun, ..., simplify = TRUE)
+}
+
+##' Produce a vector of boolean values whose i-th value is TRUE when any
+##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s))
+##'
+##' @title Search multiple strings for matches of multiple substrings
+##' @param v a vector of substrings to match
+##' @param s a vector of strings to search for matches
+##' @param ... additional arguments to grepl
+##' @return a list with keys in s and valuse that are vectors of elements of v
+##' @author Art Eschenlauer
+##' Copyright (C) 2022 Art Eschenlauer;
+##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
+mgrepl <- function(v, s, ...) {
+ grpl_rslt <- rep_len(0, length(s))
+ for (vi in v) {
+ grpl_rslt_v <- sapply(
+ X = s,
+ FUN = function(t) {
+ Reduce(
+ f = function(l, r) if (is.null(l)) r else c(l, r),
+ x = sapply(
+ X = vi,
+ FUN = function(f) grepl(f, t, ...)
+ ),
+ init = c()
+ )
+ },
+ simplify = "array"
+ )
+ grpl_rslt <- grpl_rslt + grpl_rslt_v
+ }
+ rslt <- unname(grpl_rslt > 0)
+}
+
+##' Produce positions in a vector where succeeding value != current valus
+##'
+##' @title Search vector for neighboring positions having different values
+##' @param v a vector of comparable numeric values (e.g. integers)
+##' @return a vector of positions i where v[i] != v[i + 1]
+##' @author Art Eschenlauer
+##' Copyright (C) 2022 Art Eschenlauer;
+##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms
+transition_positions <- function(v) {
+ Reduce(
+ f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l,
+ x = seq_along(v)[-1:0],
+ init = c()
+ )
+}
+
+### figure debug functions
+
+cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) {
+ cat(
+ sprintf(
+ "%spar(%s) = c(%s)%s",
+ lbl,
+ par_name,
+ paste(par(par_name), collapse = ", "),
+ if (newlines) "\n\n" else ""
+ )
+ )
+}
+
+cat_margins <- function(lbl = NULL) {
+ for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma"))
+ cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL)
+}
+
+cat_variable <-
+ function(x, suffix = "", digits = 3, force_str = FALSE) {
+ xprssn <- deparse1(substitute(x))
+ if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) {
+ cat(
+ paste0(
+ "\n\\texttt{\\textbf{",
+ whack_underscores(xprssn),
+ "}} [",
+ typeof(x),
+ ",",
+ mode(x),
+ "] =\n"
+ )
+ )
+ cat("\n\\begin{verbatim}\n")
+ str(x)
+ cat("\n\\end{verbatim}\n")
+ } else {
+ cat("\n", see_variable(x, suffix, digits, xprssn))
+ }
+ }
+
+### structure helper functions
+
+# ref: staque.R - Icon-oriented stack and queue operations
+# - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8
+#
+# sq_get(v):x produces the leftmost element of v and removes it from v,
+# but produces NA if v is empty
+sq_get <- function(v) {
+ if (length(v) == 0) return(NA)
+ assign(as.character(substitute(v)), v[-1], parent.frame())
+ return(v[1])
+}
+#
+# sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v,
+# producing v.
+# Values are pushed in order from left to right,
+# so xn becomes the last (rightmost) value on v.
+# sq_put(v) with no second argument does nothing.
+sq_put <- function(v, x = NA, ...) {
+ pf <- parent.frame()
+ if (is.null(x)) return(pf$v)
+ if (
+ !(length(x) > 1) &&
+ !rlang::is_closure(x) &&
+ is.na(x)
+ ) return(pf$v)
+ assign(as.character(substitute(v)), c(v, x, ...), pf)
+ pf[[as.character(substitute(v))]]
+}
+
### numerical/statistical helper functions
any_nan <- function(x) {
@@ -186,6 +797,7 @@
sd(x[ok])
}
+# compute anova raw p-value
anova_func <- function(x, grouping_factor, one_way_f) {
subject <- data.frame(
intensity = x
@@ -203,12 +815,421 @@
pvalue
}
+# This code adapted from matrixcalc::is.positive.definite
+# Notably, this simply tests without calling stop()
+is_positive_definite <- function(x, tol = 1e-08) {
+ if (!is.matrix(x))
+ return(FALSE)
+ if (!is.numeric(x))
+ return(FALSE)
+ if (nrow(x) < 1)
+ return(FALSE)
+ if (ncol(x) < 1)
+ return(FALSE)
+ if (nrow(x) != ncol(x))
+ return(FALSE)
+ sum_symm <- sum(x == t(x), na.rm = TRUE)
+ value_count <- Reduce("*", dim(x))
+ if (sum_symm != value_count)
+ return(FALSE)
+ eigenvalues <- eigen(x, only.values = TRUE)$values
+ n <- nrow(x)
+ for (i in 1:n) {
+ if (abs(eigenvalues[i]) < tol) {
+ eigenvalues[i] <- 0
+ }
+ }
+ if (any(eigenvalues <= 0)) {
+ return(FALSE)
+ }
+ return(TRUE)
+}
### LaTeX functions
-latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
- v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
- cat(
+# Use this like print.data.frame, from which it is adapted:
+data_frame_table_latex <-
+ function(
+ x,
+ # digits to pass to format.data.frame
+ digits = NULL,
+ # TRUE -> right-justify columns; FALSE -> left-justify
+ right = TRUE,
+ # maximumn number of rows to print
+ max = NULL,
+ # string with justification of each column
+ justification = NULL,
+ # TRUE to center on page
+ centered = TRUE,
+ # optional caption
+ caption = NULL,
+ # h(inline); b(bottom); t (top) or p (separate page)
+ anchor = "h",
+ # set underscore_whack to TRUE to escape underscores
+ underscore_whack = TRUE,
+ # how to emit results
+ emit = cat
+ ) {
+ if (is.null(justification))
+ justification <-
+ Reduce(
+ f = paste,
+ x = rep_len(if (right) "r" else "l", length(colnames(x)))
+ )
+ n <- length(rownames(x))
+ if (length(x) == 0L) {
+ emit(
+ sprintf(
+ # if n is one, use singular 'row', else use plural 'rows'
+ ngettext(
+ n,
+ "data frame with 0 columns and %d row",
+ "data frame with 0 columns and %d rows"
+ ),
+ n
+ ),
+ "\n",
+ sep = ""
+ )
+ } else if (n == 0L) {
+ emit("0 rows for:\n")
+ latex_itemized_list(
+ v = names(x),
+ underscore_whack = underscore_whack
+ )
+ } else {
+ if (is.null(max))
+ max <- getOption("max.print", 99999L)
+ if (!is.finite(max)) {
+ cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
+ knitr::knit_exit()
+ }
+ omit <- (n0 <- max %/% length(x)) < n
+ m <- as.matrix(
+ format.data.frame(
+ if (omit) x[seq_len(n0), , drop = FALSE] else x,
+ digits = digits,
+ na.encode = FALSE
+ )
+ )
+ emit(
+ # h(inline); b(bottom); t (top) or p (separate page)
+ paste0("\\begin{table}[", anchor, "]"),
+ "\\leavevmode",
+ sep = "\n"
+ )
+ if (!is.null(caption))
+ emit(paste0(" \\caption{", caption, "}"))
+ if (centered) emit("\\centering\n")
+ emit(
+ paste(
+ " \\begin{tabular}{",
+ justification,
+ "}\n",
+ sep = ""
+ )
+ )
+
+ # ref for top and bottom struts (\T and \B):
+ # https://tex.stackexchange.com/a/50355
+ if (!is.null(caption))
+ emit("\\B \\\\\n")
+ latex_table_row(
+ v = colnames(m),
+ extra = " \\T \\B",
+ underscore_whack = underscore_whack
+ )
+ emit("\\hline \\\\\n")
+ for (i in seq_len(length(m[, 1]))) {
+ latex_table_row(
+ v = m[i, ],
+ underscore_whack = underscore_whack
+ )
+ }
+ emit(
+ paste(
+ " \\end{tabular}",
+ "\\end{table}",
+ sep = "\n"
+ )
+ )
+ if (omit)
+ emit(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+ n - n0, "rows ]\n")
+ }
+ invisible(x)
+ }
+
+# Use this like print.data.frame, from which it is adapted:
+data_frame_tabbing_latex <-
+ function(
+ x,
+ # vector of tab stops, in inches
+ tabstops,
+ # vector of headings, registered with tab-stops
+ headings = colnames(x),
+ # digits to pass to format.data.frame
+ digits = NULL,
+ # maximumn number of rows to print
+ max = NULL,
+ # optional caption
+ caption = NULL,
+ # set underscore_whack to TRUE to escape underscores
+ underscore_whack = TRUE,
+ # flag for landscape mode
+ landscape = FALSE,
+ # flag indicating that subsubsection should be used for caption
+ # rather than subsection
+ use_subsubsection_header = TRUE,
+ # character-size indicator; for possible values, see:
+ # https://tug.org/texinfohtml/latex2e.html#Font-sizes
+ charactersize = "small",
+ # set verbatim to TRUE to debug output
+ verbatim = FALSE
+ ) {
+
+ hlinport <- if (landscape) {
+ function() cat("\\hlinlscp \\\\\n")
+ } else {
+ function() cat("\\hlinport \\\\\n")
+ }
+
+ tabstops_tex <-
+ Reduce(
+ f = function(l, r) paste0(l, r),
+ x = sprintf("\\hspace{%0.2fin}\\=", tabstops),
+ init = ""
+ )
+
+ n <- length(rownames(x))
+ if (length(x) == 0L) {
+ cat(
+ sprintf(
+ # if n is one, use singular 'row', else use plural 'rows'
+ ngettext(
+ n,
+ "data frame with 0 columns and %d row",
+ "data frame with 0 columns and %d rows"
+ ),
+ n
+ ),
+ "\n",
+ sep = ""
+ )
+ } else if (n == 0L) {
+ cat("0 rows for:\n")
+ latex_itemized_list(
+ v = names(x),
+ underscore_whack = underscore_whack
+ )
+ } else {
+ if (is.null(max))
+ max <- getOption("max.print", 99999L)
+ if (!is.finite(max)) {
+ cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max)
+ knitr::knit_exit()
+ }
+ omit <- (n0 <- max %/% length(x)) < n
+ m <- as.matrix(
+ format.data.frame(
+ if (omit) x[seq_len(n0), , drop = FALSE] else x,
+ digits = digits,
+ na.encode = FALSE
+ )
+ )
+ if (landscape)
+ cat("\n\\begin{landscape}")
+ tex_caption <-
+ if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption)
+ else "\n"
+ # build the column names, which have multiple lines when
+ # length(headings) is a multiple of the number of columns
+ column_names <- ""
+ while (length(headings) > 0) {
+ my_row <- c()
+ for (i in 1:(1 + length(tabstops))) {
+ my_field <- sq_get(headings)
+ sq_put(my_row, if (is.na(my_field)) "" else my_field)
+ }
+ column_names %||:=% latex_tabbing_row(
+ v = my_row,
+ underscore_whack = underscore_whack,
+ action = paste0
+ )
+ }
+
+ # Begin tabbing environment after beginning charactersize environment
+ if (verbatim) cat("\n\\begin{verbatim}")
+ cat(
+ paste0(
+ "\n\\begin{", charactersize, "}", tex_caption,
+ "\\begin{tabwrap}{", tabstops_tex, "}\n"
+ )
+ )
+ # emit column names
+ cat(column_names)
+ # emit hline
+ hlinport()
+ for (i in seq_len(length(m[, 1]))) {
+ my_row <- latex_tabbing_row(
+ v = m[i, ],
+ underscore_whack = underscore_whack,
+ action = paste0
+ )
+ if (FALSE)
+ cat(my_row)
+ else
+ cat(my_row)
+ }
+ hlinport()
+ if (omit)
+ cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
+ n - n0, "rows ]\n")
+ # End charactersize environment after ending tabbing environment
+ cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n"))
+ if (verbatim) cat("\\end{verbatim}\n")
+ if (landscape)
+ cat("\\end{landscape}\n")
+ }
+ invisible(x)
+ }
+
+param_df_noexit <-
+ function(e = NULL) {
+ data_frame_tabbing_latex(
+ x = param_df,
+ tabstops = c(1.75),
+ underscore_whack = TRUE,
+ caption = "Input parameters",
+ verbatim = FALSE
+ )
+ if (!is.null(e)) {
+ sink(stderr())
+ cat("Caught fatal error:\n\n")
+ str(e)
+ sink()
+ }
+ }
+
+param_df_exit <-
+ function(e = NULL) {
+ param_df_noexit(e)
+ knitr::knit_exit()
+ exit(-1)
+ }
+
+# exit with exit code (default 0) and optional msg
+exit <-
+ function(code = 0, msg = NULL, use_stderr = FALSE) {
+ if (!is.null(msg)) {
+ if (use_stderr) sink(stderr())
+ cat("\n\n", msg, "\n\n")
+ if (use_stderr) sink()
+ }
+ q(save = "no", status = code)
+ }
+
+# make control sequences into printable latex sequences
+latex_printable_control_seqs <-
+ function(s) {
+ s <- gsub("[\\]", "xyzzy_plugh", s)
+ s <- gsub("[$]", "\\\\$", s)
+ s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
+ return(s)
+ }
+nolatex_verbatim <-
+ function(expr) eval(expr)
+
+latex_verbatim <-
+ function(expr) {
+ arg_string <- deparse1(substitute(expr))
+ cat("\n\\begin{verbatim}\n___\n")
+ tryCatch(
+ expr = expr,
+ error = param_df_exit,
+ #ACE error =
+ #ACE function(e) {
+ #ACE cat("Caught error:\n\n")
+ #ACE str(e)
+ #ACE knitr::knit_exit()
+ #ACE stop(e)
+ #ACE },
+ finally = cat("...\n\\end{verbatim}\n")
+ )
+ }
+
+latex_samepage <-
+ function(expr) {
+ arg_string <- deparse1(substitute(expr))
+ cat("\n\\begin{samepage}\n")
+ tryCatch(
+ expr = expr,
+ error = param_df_exit,
+ #ACE error =
+ #ACE function(e) {
+ #ACE cat("Caught error:\n\n")
+ #ACE str(e)
+ #ACE knitr::knit_exit()
+ #ACE stop(e)
+ #ACE },
+ finally = cat("\n\\end{samepage}\n")
+ )
+ }
+
+# return the result of invocation after showing parameters
+# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+latex_show_invocation <-
+ function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) {
+ function(...) {
+ my_env <- (as.list(environment()))
+ va <- list(...)
+ my_rslt <- new_env()
+ my_rslt$rslt <- NULL
+ latex_verbatim(
+ expr = {
+ cat(sprintf("\n .. Local variables for '%s':\n\n", f_name))
+ str(va)
+ if (!head_patch) {
+ # return this result
+ # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+ cat(sprintf("\n .. Invoking '%s'\n", f_name))
+ tryCatch(
+ {
+ cat("\n\\end{verbatim}\n")
+ rslt <- do.call(f, va)
+ },
+ error = param_df_exit,
+ #ACE error = function(e) {
+ #ACE cat("\n\\begin{verbatim}\n")
+ #ACE str(e)
+ #ACE cat("\n\\end{verbatim}\n")
+ #ACE knitr::knit_exit()
+ #ACE stop(e)
+ #ACE },
+ finally = cat("\n\\begin{verbatim}\n")
+ )
+ cat(sprintf("\n .. '%s' returned:\n", f_name))
+ str(rslt)
+ my_rslt$rslt <- rslt
+ }
+ }
+ )
+ # return the result of invocation with the shown parameters
+ # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/
+ if (head_patch) my_rslt$rslt <- do.call(f, va)
+ (my_rslt$rslt)
+ }
+ }
+
+latex_collapsed_vector <- function(
+ collapse_string,
+ v,
+ underscore_whack = TRUE,
+ action = cat0
+ ) {
+ v_sub <-
+ if (underscore_whack) whack_underscores(v) else v
+ action(
paste0(
v_sub,
collapse = collapse_string
@@ -242,113 +1263,36 @@
cat(" \\\\\n")
}
-# Use this like print.data.frame, from which it is adapted:
-data_frame_latex <-
- function(
- x,
- ...,
- # digits to pass to format.data.frame
- digits = NULL,
- # TRUE -> right-justify columns; FALSE -> left-justify
- right = TRUE,
- # maximumn number of rows to print
- max = NULL,
- # string with justification of each column
- justification = NULL,
- # TRUE to center on page
- centered = TRUE,
- # optional caption
- caption = NULL,
- # h(inline); b(bottom); t (top) or p (separate page)
- anchor = "h",
- # set underscore_whack to TRUE to escape underscores
- underscore_whack = TRUE
+latex_tabbing_row <- function(
+ v,
+ extra = "",
+ underscore_whack = TRUE,
+ action = cat0
) {
- if (is.null(justification))
- justification <-
- Reduce(
- f = paste,
- x = rep_len(if (right) "r" else "l", length(colnames(x)))
- )
- n <- length(rownames(x))
- if (length(x) == 0L) {
- cat(
- sprintf(
- # if n is one, use singular 'row', else use plural 'rows'
- ngettext(
- n,
- "data frame with 0 columns and %d row",
- "data frame with 0 columns and %d rows"
- ),
- n
- ),
- "\n",
- sep = ""
- )
- } else if (n == 0L) {
- cat("0 rows for:\n")
- latex_itemized_list(
- v = names(x),
- underscore_whack = underscore_whack
- )
- } else {
- if (is.null(max))
- max <- getOption("max.print", 99999L)
- if (!is.finite(max))
- stop("invalid 'max' / getOption(\"max.print\"): ",
- max)
- omit <- (n0 <- max %/% length(x)) < n
- m <- as.matrix(
- format.data.frame(
- if (omit) x[seq_len(n0), , drop = FALSE] else x,
- digits = digits,
- na.encode = FALSE
- )
- )
- cat(
- # h(inline); b(bottom); t (top) or p (separate page)
- paste0("\\begin{table}[", anchor, "]\n")
- )
- if (!is.null(caption))
- cat(paste0(" \\caption{", caption, "}"))
- if (centered) cat("\\centering\n")
- cat(
- paste(
- " \\begin{tabular}{",
- justification,
- "}\n",
- sep = ""
- )
- )
- # ref: https://tex.stackexchange.com/a/50353
- # Describes use of \rule{0pt}{3ex}
- if (!is.null(caption))
- cat("\\B \\\\ \\hline\\hline\n")
- # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
- latex_table_row(
- v = colnames(m),
- extra = "\\T\\B",
- underscore_whack = underscore_whack
- )
- cat("\\hline\n")
- for (i in seq_len(length(m[, 1]))) {
- latex_table_row(
- v = m[i, ],
- underscore_whack = underscore_whack
- )
- }
- cat(
- paste(
- " \\end{tabular}",
- "\\end{table}",
- sep = "\n"
- )
- )
- if (omit)
- cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
- n - n0, "rows ]\n")
- }
- invisible(x)
+ # latex_collapsed_vector applies action to result of paste0;
+ # by default, action = cat;
+ # hence, a scalar string is assigned to v_collapsed
+ v_collapsed <-
+ latex_collapsed_vector(
+ "} \\> \\tabfill{",
+ v,
+ underscore_whack,
+ action = paste0
+ )
+ action(
+ "\\tabfill{",
+ v_collapsed,
+ "}",
+ extra,
+ " \\\\\n"
+ )
+ }
+
+# N.B. use con = "" to emulate regular cat
+fcat0 <-
+ function(..., sprtr = " ", cnnctn = file()) {
+ cat0(..., sep = sprtr, file = cnnctn)
+ invisible(cnnctn)
}
hypersub <-
@@ -356,32 +1300,39 @@
hyper <- tolower(s)
hyper <- gsub("[^a-z0-9]+", "-", hyper)
hyper <- gsub("[-]+", "-", hyper)
+ hyper <- gsub("[_]+", "-", hyper)
hyper <- sub("^[-]", "", hyper)
hyper <- sub("[-]$", "", hyper)
return(hyper)
}
-subsection_header <-
- function(s) {
- hyper <- hypersub(s)
- cat(
- sprintf(
- "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
- hyper, s, hyper
- )
- )
+table_href <- function(s = "offset", caption = "") {
+ paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}")
+ }
+
+table_offset <- function(i = 0, s = "offset", new = FALSE) {
+ paste0(
+ if (new) paste0("\\newcounter{", s, "}\n") else "",
+ "\\setcounter{", s, "}{\\value{table}}\n",
+ paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n")
+ )
}
-subsubsection_header <-
- function(s) {
+a_section_header <-
+ function(s, prefix = "") {
hyper <- hypersub(s)
- cat(
- sprintf(
- "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
- hyper, s, hyper
- )
+ my_subsection_header <- sprintf(
+ "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n",
+ hyper,
+ prefix,
+ gsub("_", "\\_", s, fixed = TRUE),
+ hyper
)
+ my_subsection_header
}
+section_header <- function(s) a_section_header(s, "")
+subsection_header <- function(s) a_section_header(s, "sub")
+subsubsection_header <- function(s) a_section_header(s, "subsub")
### SQLite functions
@@ -419,10 +1370,54 @@
### KSEA functions and helpers
-# Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
-# - maximum log2(FC)
+#' The KSEA App Analysis (KSEA Kinase Scores Only)
+#'
+#' Compute KSEA kinase scores and statistics from phoshoproteomics data input
+#' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC)
+#'
+#' Result is an R data.frame with column names
+#' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+#' "Please refer to the original Casado et al. publication for detailed
+#' description of these columns and what they represent:
+#'
+#' - Kinase.Gene indicates the gene name for each kinase.
+#' - mS represents the mean log2(fold change) of all the
+#' kinase's substrates.
+#' - Enrichment is the background-adjusted value of the kinase's mS.
+#' - m is the total number of detected substrates
+#' from the experimental dataset for each kinase.
+#' - z.score is the normalized score for each kinase, weighted by
+#' the number of identified substrates.
+#' - p.value represents the statistical assessment for the z.score.
+#' - FDR is the p-value adjusted for multiple hypothesis testing
+#' using the Benjamini & Hochberg method."
+#'
+#' @param ksdata the Kinase-Substrate dataset uploaded from the file
+#' prefaced with "PSP&NetworKIN_"
+#' available from github.com/casecpb/KSEA/
+#' @param px the experimental data file formatted as described in
+#' the KSEA.Complete() documentation
+#' @param networkin a binary input of TRUE or FALSE, indicating whether
+#' or not to include NetworKIN predictions, where
+#' \code{NetworKIN = TRUE}
+#' means include NetworKIN predictions
+#' @param networkin_cutoff a numeric value between 1 and infinity setting
+#' the minimum NetworKIN score
+#' (this can be omitted if NetworKIN = FALSE)
+#'
+#' @return creates a new R data.frame with all the KSEA kinase
+#' scores, along with each one's statistical
+#' assessment, as described herein.
+#'
+#' @references
+#'
+#' Casado et al. (2013) Sci Signal. 6(268):rs6
+#'
+#' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20
+#'
+#' Horn et al. (2014) Nature Methods 11(6):603-4
+#'
ksea_scores <- function(
-
# For human data, typically, ksdata = KSEAapp::ksdata
ksdata,
@@ -444,9 +1439,13 @@
# A numeric value between 1 and infinity setting the minimum NetworKIN
# score (can be left out if networkin = FALSE)
- networkin_cutoff
+ networkin_cutoff,
+
+ # Minimum substrate count, necessary to adjust the p-value appropriately.
+ minimum_substrate_count
) {
+ # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution.
if (length(grep(";", px$Residue.Both)) == 0) {
# There are no Residue.Both entries having semicolons, so new is
# simply px except two columns are renamed and a column is added
@@ -492,18 +1491,53 @@
# Eliminate rows having missing values (e.g., non-imputed data)
new <- new[complete.cases(new$log2_fc), ]
}
- if (networkin == TRUE) {
- # When NetworKIN is true, filter on NetworKIN.cutoff which includes
- # PhosphoSitePlus data *because its networkin_score is set to Inf*
- ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
- ksdata_filtered <- ksdata_filtered[
- (ksdata_filtered$networkin_score >= networkin_cutoff), ]
- } else {
- # Otherwise, simply use PhosphSitePlus rows
- ksdata_filtered <- ksdata[
- grep("PhosphoSitePlus", ksdata$Source), ]
+ # At this point, new$log2_fc is signed according to which contrast has
+ # the greater intensity
+ # To take the magnitude into account without taking the direction into
+ # account, set params$kseaUseAbsoluteLog2FC to TRUE
+ #
+ # Should KSEA be performed aggregating signed log2FC or absolute?
+ # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores
+ if (params$kseaUseAbsoluteLog2FC) {
+ # TRUE use abs(log2FC) for KSEA as Justin requested; this is a
+ # justifiable deviation from the KSEAapp::KSEA.Scores algorithm.
+ new$log2_fc <- abs(new$log2_fc)
+ }
+
+ monitor_filtration_on_stderr <- TRUE
+ if (monitor_filtration_on_stderr) {
+ # set to TRUE to monitor filtration on stderr
+ sink(stderr())
+ cat(see_variable(networkin, "\n"))
}
- # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
+ ksdata_filtered <-
+ sqldf(
+ sprintf("%s %s",
+ "select * from ksdata where not Source = 'NetworKIN'",
+ if (networkin)
+ sprintf("or networkin_score >= %d", networkin_cutoff)
+ else
+ ""
+ )
+ )
+ if (monitor_filtration_on_stderr) {
+ cat(see_variable(sqldf(
+ "select count(*), Source from ksdata group by Source"), "\n"))
+ cat(see_variable(sqldf(
+ "select count(*), Source from ksdata_filtered group by Source"), "\n"))
+ sink()
+ }
+
+ ############################################################################
+ # Line numbers below refer to lines of:
+ # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R
+ # I would put the original line in a comment but then lint would complain...
+ # - Indeed, I had to rename all the variables because lint didn't like names
+ # containing periods or capital letters.
+ # ACE
+ ############################################################################
+ #
+ # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
# colnames of ksdata_filtered:
# "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
# "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
@@ -516,6 +1550,8 @@
# INNER JOIN new b
# ON a.SUB_GENE = b.SUB_GENE
# AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
+ # (KSEA.Scores.R line # 105)
+ # "Extract KSData.filtered annotations that are only found in new"
ksdata_dataset <- base::merge(ksdata_filtered, new)
# colnames of ksdata_dataset:
# "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
@@ -523,24 +1559,31 @@
# "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
# "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
# Re-order dataset; prior to accounting for isoforms
+ # (KSEA.Scores.R line # 106)
ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
# Extract non-isoform accession in UniProtKB
+ # (KSEA.Scores.R line # 107)
ksdata_dataset$uniprot_no_isoform <- sapply(
ksdata_dataset$KIN_ACC_ID,
function(x) unlist(strsplit(as.character(x), split = "-"))[1]
)
+ # "last expression collapses isoforms ... for easy processing"
# Discard previous results while selecting interesting columns ...
+ # (KSEA.Scores.R line # 110)
ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
# Column names are now:
# "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
# "FC" "log2_fc" "Source"
# Make column names human-readable
+ # (KSEA.Scores.R line # 111)
colnames(ksdata_dataset_abbrev) <- c(
"Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
"FC", "log2FC", "Source"
)
# SELECT * FROM ksdata_dataset_abbrev
# ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
+ # (KSEA.Scores.R line # 112)
+ # "Extract KSData.filtered annotations that are only found in new"
ksdata_dataset_abbrev <-
ksdata_dataset_abbrev[
order(
@@ -549,6 +1592,7 @@
ksdata_dataset_abbrev$Substrate.Mod,
ksdata_dataset_abbrev$p),
]
+ if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
# First aggregation step to account for multiply phosphorylated peptides
# and differing peptide sequences; the goal here is to combine results
# for all measurements of the same substrate.
@@ -560,12 +1604,16 @@
# ORDER BY `Kinase.Gene`;
# in two steps:
# (1) compute average log_2(fold-change)
+ # "take the mean of the log2FC amongst phosphosite duplicates"
+ # (KSEA.Scores.R line # 115)
ksdata_dataset_abbrev <- aggregate(
log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
data = ksdata_dataset_abbrev,
FUN = mean
)
+ if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev))
# (2) order by Kinase.Gene
+ # (KSEA.Scores.R line # 117)
ksdata_dataset_abbrev <-
ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
# SELECT `Kinase.Gene`, count(*)
@@ -573,9 +1621,14 @@
# GROUP BY `Kinase.Gene`;
# in two steps:
# (1) Extract the list of Kinase.Gene names
+ # "@@@@@@@@@@@@@@@@@@@@"
+ # "Do analysis for KSEA"
+ # "@@@@@@@@@@@@@@@@@@@@"
+ # (KSEA.Scores.R line # 124)
kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
# (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
# named by Kinase.Gene
+ # (KSEA.Scores.R line # 125)
kinase_list <- as.matrix(table(kinase_list))
# Second aggregation step to account for all substrates per kinase
# CREATE TABLE mean_fc
@@ -583,50 +1636,123 @@
# SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
# FROM ksdata_dataset_abbrev
# GROUP BY `Kinase.Gene`
- mean_fc <- aggregate(
- log2FC ~ Kinase.Gene,
- data = ksdata_dataset_abbrev,
- FUN = mean
- )
- # mean_fc columns: "Kinase.Gene", "log2FC"
- if (FALSE) {
- # I need to re-think this; I was trying to find the most-represented
- # peptide, but that horse has already left the barn
- # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
- # FROM ksdata_dataset_abbrev
- # GROUP BY `Kinase.Gene`
- max_fc <- aggregate(
+ # (KSEA.Scores.R line # 127)
+ if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n")
+ mean_fc <-
+ aggregate(
log2FC ~ Kinase.Gene,
data = ksdata_dataset_abbrev,
- FUN = function(r) max(abs(r))
+ FUN = mean
)
- }
+ if (print_nb_messages) nbe(see_variable(mean_fc), "\n")
+
+ # for contrast j
+ # for each kinase i
+ # extract log2 of fold-change (from `new` above)
+ # (used in KSEA.Scores.R lines # 130 & 132)
+ log2_fc_j_each_i <-
+ new$log2_fc
+
+ # for contrast j
+ # for all kinases i
+ # compute mean of abs(log2 of fold-change)
+ # (used in KSEA.Scores.R lines # 130)
+ mean_abs_log2_fc_j_all_i <-
+ mean(abs(log2_fc_j_each_i), na.rm = TRUE)
+
+ # for contrast j
+ # for all kinases i
+ # compute mean of log2 of fold-change
+ # (used in KSEA.Scores.R lines # 132)
+ mean_log2_fc_j_all_i <-
+ mean(log2_fc_j_each_i, na.rm = TRUE)
+
+ # Reorder mean_fc, although I don't see why
+ # (KSEA.Scores.R line 128
+ mean_fc <- mean_fc[order(mean_fc[, 1]), ]
+ # mean_fc columns so far: "Kinase.Gene", "log2FC"
+ # - Kinase.Gene
+ # indicates the gene name for each kinase.
# Create column 3: mS
- mean_fc$m_s <- mean_fc[, 2]
+ # - mS
+ # represents the mean log2(fold change) of all the
+ # kinase's substrates.
+ # (KSEA.Scores.R line # 129)
+ mean_fc$m_s <-
+ mean_fc_m_s <- mean_fc[, 2]
+
# Create column 4: Enrichment
- mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
- # Create column 5: m, count of substrates
- mean_fc$m <- kinase_list
+ # - Enrichment
+ # is the background-adjusted value of the kinase's mS.
+ # (KSEA.Scores.R line # 130)
+ mean_fc$enrichment <-
+ mean_fc_m_s / mean_abs_log2_fc_j_all_i
+
+ # Create column 5: m, count of substrates of kinase (count of j for i)
+ # - m
+ # is the total number of detected substrates
+ # from the experimental dataset for each kinase.
+ # (KSEA.Scores.R line # 131)
+ mean_fc$m <-
+ mean_fc_m <- kinase_list
+
+
# Create column 6: z-score
- mean_fc$z_score <- (
- (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
- sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
+ # - z.score
+ # is the normalized score for each kinase, weighted by
+ # the number of identified substrates.
+ # (KSEA.Scores.R line # 132)
+ mean_fc$z_score <-
+ (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) /
+ sd(log2_fc_j_each_i, na.rm = TRUE)
+
# Create column 7: p-value, deduced from z-score
- mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+ # - p.value
+ # represents the statistical assessment for the z.score.
+ # (KSEA.Scores.R line # 133)
+ # "one-tailed p-value"
+ mean_fc$p_value <-
+ pnorm(-abs(mean_fc$z_score))
+
+ # zap excluded kinases; this must be done before adjusting p-value
+ if (TRUE) {
+ mean_fc <-
+ mean_fc[
+ mean_fc$m >= minimum_substrate_count,
+ ,
+ drop = FALSE
+ ]
+ }
+
+ #ACE nb(see_variable(nrow(mean_fc)), "\n")
# Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
- mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
-
- # Remove log2FC column, which is duplicated as mS
- mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
+ # - FDR
+ # is the p-value adjusted for multiple hypothesis testing
+ # using the Benjamini & Hochberg method."
+ # (KSEA.Scores.R line # 134)
+ mean_fc$fdr <-
+ p.adjust(mean_fc$p_value, method = "fdr")
+
+ # It makes no sense to leave Z-scores negative when using
+ # absolute value of fold-change
+ if (params$kseaUseAbsoluteLog2FC) {
+ mean_fc$z_score <- abs(mean_fc$z_score)
+ }
+
+ # Remove second column (log2FC), which is duplicated as mS
+ # (KSEA.Scores.R line # 136)
+ mean_fc <-
+ mean_fc[order(mean_fc$Kinase.Gene), -2]
# Correct the column names which we had to hack because of the linter...
colnames(mean_fc) <- c(
"Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
)
+ # (KSEA.Scores.R line # 138)
return(mean_fc)
}
-low_fdr_barplot <- function(
+ksea_low_fdr_barplot_factory <- function(
rslt,
i_cntrst,
i,
@@ -658,57 +1784,94 @@
"p.value" = {
k$p_value
},
- stop(
- sprintf(
- "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
- ksea_cutoff_statistic
+ {
+ cat(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
)
- )
- )
+ param_df_exit()
+ knitr::knit_exit()
+ }
+ )
k <- k[selector < ksea_cutoff_threshold, ]
-
- if (nrow(k) > 1) {
- op <- par(mai = c(1, 1.5, 0.4, 0.4))
+ nrow_k <- nrow(k)
+
+ #ACE nbe(see_variable(fdr_barplot_dataframe <- k))
+
+ if (nrow_k > 0) {
+ max_nchar_rowname <- max(nchar(rownames(k)))
+ my_cex_names <- 1.0 / (1 + nrow_k / 50)
+
+ if (print_trace_messages) cat_margins("Initially")
+ if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0)
+ if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0)
+ if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0)
+
+ # fin: The figure region dimensions, (width, height), in inches.
+ # mar: A numerical vector of the form c(bottom, left, top, right)
+ # that gives the number of lines of margin to be specified
+ # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1
+
+# mar: The figure region dimensions, (width, height), in inches.
numeric_z_score <- as.numeric(k$z_score)
- z_score_order <- order(numeric_z_score)
+ bar_order <- order(-as.numeric(k$p_value))
kinase_name <- k$kinase_gene
long_caption <-
sprintf(
- "Kinase z-score, %s < %s, %s",
+ "Kinase z-score, %s, KSEA %s < %s",
+ caption,
ksea_cutoff_statistic,
- ksea_cutoff_threshold,
- caption
+ ksea_cutoff_threshold
)
my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
- cat("\n\\clearpage\n")
- barplot(
- height = numeric_z_score[z_score_order],
- border = NA,
- xpd = FALSE,
- cex.names = 1.0,
- cex.axis = 1.0,
- main = long_caption,
- cex.main = my_cex_caption,
- names.arg = kinase_name[z_score_order],
- horiz = TRUE,
- srt = 45,
- las = 1)
- par(op)
+ # return a function that draws the plot
+ function() {
+ par_fin <- par("fin") # vector of width_in_inches and height_in_inches)
+ op <- par(
+ bg = if (print_trace_messages) "yellow" else "white",
+ fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)),
+ mar = par("mar") +
+ c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0)
+ # bottom, left, top, right
+ )
+ on.exit(par(op))
+ if (print_trace_messages) cat_margins("Eventually")
+
+ barplot(
+ height = numeric_z_score[bar_order],
+ border = NA,
+ xpd = FALSE,
+ cex.names = my_cex_names,
+ main = long_caption,
+ cex.main = my_cex_caption,
+ names.arg = kinase_name[bar_order],
+ horiz = TRUE,
+ srt = 45,
+ las = 1,
+ cex.axis = 0.9
+ )
+ }
}
+ } else {
+ no_op
}
}
# note that this adds elements to the global variable `ksea_asterisk_hash`
-low_fdr_print <- function(
+ksea_low_fdr_print <- function(
rslt,
i_cntrst,
i,
a_level,
b_level,
fold_change,
- caption
+ caption,
+ write_db = TRUE, # if TRUE, write to DB, else print table
+ anchor = c(const_table_anchor_p, const_table_anchor_t)
) {
rslt_score_list_i <- rslt$score_list[[i]]
if (!is.null(rslt_score_list_i)) {
@@ -734,13 +1897,17 @@
"p.value" = {
k$p_value
},
- stop(
- sprintf(
- "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
- ksea_cutoff_statistic
+ {
+ cat(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
)
- )
- )
+ param_df_exit()
+ knitr::knit_exit()
+ }
+ )
k <- k[selector < ksea_cutoff_threshold, ]
# save kinase names to ksea_asterisk_hash
@@ -748,110 +1915,175 @@
ksea_asterisk_hash[[kinase_name]] <- 1
}
- db_write_table_overwrite <- (i_cntrst < 2)
- db_write_table_append <- !db_write_table_overwrite
- RSQLite::dbWriteTable(
- conn = db,
- name = "contrast_ksea_scores",
- value = contrast_ksea_scores,
- append = db_write_table_append
+ if (write_db) {
+ db_write_table_overwrite <- (i_cntrst < 2)
+ db_write_table_append <- !db_write_table_overwrite
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "contrast_ksea_scores",
+ value = contrast_ksea_scores,
+ append = db_write_table_append
+ )
+ ""
+ } else {
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ contrast_ksea_scores$fdr
+ },
+ "p.value" = {
+ contrast_ksea_scores$p_value
+ },
+ {
+ cat(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ param_df_exit()
+ knitr::knit_exit()
+ }
)
- selector <- switch(
- ksea_cutoff_statistic,
- "FDR" = {
- contrast_ksea_scores$fdr
- },
- "p.value" = {
- contrast_ksea_scores$p_value
- },
- stop(
- sprintf(
- "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
- ksea_cutoff_statistic
+ if (print_nb_messages) nbe(see_variable(contrast_ksea_scores))
+ output_df <- contrast_ksea_scores[
+ selector < ksea_cutoff_threshold,
+ c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
+ "z_score", "p_value", "fdr")
+ ]
+ output_df$kinase_gene <-
+ gsub(
+ "_",
+ "\\\\_",
+ output_df$kinase_gene
+ )
+ colnames(output_df) <-
+ c(
+ colnames(output_df)[1],
+ colnames(output_df)[2],
+ "enrichment",
+ "m_s",
+ "z_score",
+ "p_value",
+ "fdr"
+ )
+ #ACE output_order <- with(output_df, order(fdr))
+ output_order <- with(output_df, order(p_value))
+ output_df <- output_df[output_order, ]
+
+ output_df[, 2] <- sprintf("%0.3g", output_df[, 2])
+ output_df$fdr <- sprintf("%0.4f", output_df$fdr)
+ output_df$p_value <- sprintf("%0.2e", output_df$p_value)
+ output_df$z_score <- sprintf("%0.2f", output_df$z_score)
+ output_df$m_s <- sprintf("%d", output_df$m_s)
+ output_df$enrichment <- sprintf("%0.3g", output_df$enrichment)
+ output_ncol <- ncol(output_df)
+ colnames(output_df) <-
+ c(
+ "Kinase",
+ "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)",
+ "Enrichment",
+ "Substrates",
+ "z-score",
+ "p-value",
+ "FDR"
+ )
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ rslt$score_list[[i]]$FDR
+ },
+ "p.value" = {
+ rslt$score_list[[i]]$p.value
+ },
+ {
+ cat(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ param_df_exit()
+ knitr::knit_exit()
+ }
+ )
+ if (sum(selector < ksea_cutoff_threshold) > 0) {
+ if (print_nb_messages) nbe(see_variable(output_df))
+ math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
+ math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
+ # with (
+ # output_df,
+ # )
+ if (TRUE) {
+ output_df$Kinase <- whack_underscores(output_df$Kinase)
+ data_frame_tabbing_latex(
+ x = output_df,
+ # vector of tab stops, in inches
+ tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0),
+ # vector of headings, registered with tab-stops
+ headings = colnames(output_df),
+ # digits to pass to format.data.frame
+ digits = NULL,
+ # maximumn number of rows to print
+ max = NULL,
+ # optional caption
+ caption = sprintf(
+ "\\text{%s}, KSEA %s < %s",
+ math_caption,
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold
+ ),
+ # set underscore_whack to TRUE to escape underscores
+ underscore_whack = FALSE,
+ # flag for landscape mode
+ landscape = FALSE,
+ # flag indicating that subsubsection should be used for caption
+ # rather than subsection
+ use_subsubsection_header = TRUE,
+ # character-size indicator; for possible values, see:
+ # https://tug.org/texinfohtml/latex2e.html#Font-sizes
+ charactersize = "small",
+ # set verbatim to TRUE to debug output
+ verbatim = FALSE
+ )
+ } else {
+ data_frame_table_latex(
+ x = output_df,
+ justification = "l c c c c c c",
+ centered = TRUE,
+ caption = sprintf(
+ "\\text{%s}, KSEA %s < %s",
+ math_caption,
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold
+ ),
+ anchor = anchor,
+ underscore_whack = FALSE
+ )
+ }
+ } else {
+ cat(
+ sprintf(
+ "\\break
+ No kinases had
+ \\(\\text{KSEA %s}_\\text{enrichment} < %s\\)
+ for contrast %s\\hfill\\break\n",
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold,
+ caption
)
)
- )
- output_df <- contrast_ksea_scores[
- selector < ksea_cutoff_threshold,
- c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
- "z_score", "p_value", "fdr")
- ]
- output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
- output_df <- output_df[output_order, ]
- colnames(output_df) <-
- c(
- colnames(output_df)[1],
- colnames(output_df)[2],
- "enrichment",
- "m_s",
- "z_score",
- "p_value",
- "fdr"
- )
- output_df$fdr <- sprintf("%0.4f", output_df$fdr)
- output_df$p_value <- sprintf("%0.2e", output_df$p_value)
- output_df$z_score <- sprintf("%0.2f", output_df$z_score)
- output_df$m_s <- sprintf("%d", output_df$m_s)
- output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
- output_ncol <- ncol(output_df)
- colnames(output_df) <-
- c(
- "Kinase",
- "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
- "Enrichment",
- "Substrates",
- "z-score",
- "p-value",
- "FDR"
- )
- selector <- switch(
- ksea_cutoff_statistic,
- "FDR" = {
- rslt$score_list[[i]]$FDR
- },
- "p.value" = {
- rslt$score_list[[i]]$p.value
- },
- stop(
- sprintf(
- "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
- ksea_cutoff_statistic
- )
- )
- )
- if (sum(selector < ksea_cutoff_threshold) > 0) {
- math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
- math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
- data_frame_latex(
- x = output_df,
- justification = "l c c c c c c",
- centered = TRUE,
- caption = sprintf(
- "\\text{%s}, %s < %s",
- math_caption,
- ksea_cutoff_statistic,
- ksea_cutoff_threshold
- ),
- anchor = const_table_anchor_p
- )
- } else {
- cat(
- sprintf(
- "\\break
- No kinases had
- \\(\\text{%s}_\\text{enrichment} < %s\\)
- for contrast %s\\hfill\\break\n",
- ksea_cutoff_statistic,
- ksea_cutoff_threshold,
- caption
- )
- )
+ }
}
+ } else {
+ ""
}
}
# create_breaks is a helper for ksea_heatmap
create_breaks <- function(merged_scores) {
+ if (sum(!is.na(merged_scores)) < 2)
+ return(NULL)
if (min(merged_scores, na.rm = TRUE) < -1.6) {
breaks_neg <- seq(-1.6, 0, length.out = 30)
breaks_neg <-
@@ -889,14 +2121,135 @@
return(color_breaks)
}
+hm2plus <- function(
+ x,
+ mat = matrix(
+ c(
+ c(0, 4, 0),
+ c(0, 3, 3),
+ c(2, 1, 1)
+ ),
+ nrow = 3,
+ ncol = 3,
+ byrow = TRUE
+ ),
+ denwid = 0.5,
+ denhgt = 0.15,
+ widths = c(0.5, 2.5, 1.5),
+ heights = c(0.4, 0.15, 3.95),
+ divergent = FALSE,
+ notecol = "grey50",
+ trace = "none",
+ margins = c(6, 20),
+ srtcol = 90,
+ srtrow = 0,
+ density_info = "none",
+ key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+ key_par = list(),
+ hclustfun = hclust,
+ ...
+) {
+
+ varargs <- list(...)
+ if (FALSE) # this is to avoid commenting out code to pass linting...
+ my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE)
+ else
+ my_hm2 <- heatmap.2
+
+ x <- as.matrix(x)
+ if (sum(!is.na(x)) < 1)
+ return(NULL)
+ color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough
+ break_count <- 1 + color_count
+ min_nonax <- min(x, na.rm = TRUE)
+ max_nonax <- max(x, na.rm = TRUE)
+ if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n")
+ if (divergent) {
+ zlim <- max(abs(min_nonax), abs(max_nonax))
+ if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
+ breaks <- (zlim) / (break_count:1)
+ if (print_nb_messages) nb(see_variable(breaks, "\n"))
+ breaks <- breaks - median(breaks)
+ zlim <- c(-zlim, zlim)
+ if (print_nb_messages) nb(see_variable(zlim, "\n"))
+ } else {
+ zlim <- max(abs(min_nonax), abs(max_nonax))
+ if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n"))
+ breaks <- zlim / (break_count:1)
+ if (print_nb_messages) nb(see_variable(breaks, "\n"))
+ if (max_nonax < 0) {
+ breaks <- breaks - zlim
+ zlim <- c(-zlim, 0)
+ } else {
+ zlim <- c(0, zlim)
+ }
+ if (print_nb_messages) nb(see_variable(zlim, "\n"))
+ }
+ nonax <- x
+ nonax[is.na(x)] <- min_nonax
+ if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5)
+ if (is.null(heights)) heights <- c(0.4, denhgt, 4.0)
+ colors <-
+ if (divergent && min_nonax < 0) {
+ # divergent colors on both sides of zero
+ colorRampPalette(c("red", "white", "blue"))(color_count)
+ } else if (divergent && min_nonax > 0) {
+ # "divergent" colors > zero
+ colorRampPalette(c("white", "blue"))(color_count)
+ } else if (divergent && max_nonax < 0) {
+ # "divergent" colors < zero
+ colorRampPalette(c("red", "white"))(color_count)
+ } else {
+ # "non-divergent" colors including zero
+ hcl.colors(color_count, "YlOrRd", rev = TRUE)
+ }
+
+ #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n")
+ #ACE if (print_nb_messages) nb(see_variable(colors, "\n"))
+ #ACE key_par$col = colors
+ #ACE key_par$breaks = breaks
+
+ if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me
+ if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me
+ suppressWarnings(
+ my_hm2(
+ x = x,
+ col = colors,
+ #ACE symkey = FALSE,
+ density.info = density_info,
+ srtCol = srtcol,
+ srtRow = srtrow,
+ margins = margins,
+ lwid = widths,
+ lhei = heights,
+ key.title = NA,
+ key.xlab = key_xlab,
+ key.par = key_par,
+ lmat = mat,
+ notecol = notecol,
+ trace = trace,
+ bg = "yellow",
+ hclustfun = hclustfun,
+ #ACE breaks = breaks,
+ oldstyle = FALSE,
+ ... # varargs
+ )
+ )
+ # implicitly returning value returned by heatmap.2
+}
+
# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
draw_kseaapp_summary_heatmap <- function(
- x,
- sample_cluster,
- merged_asterisk,
- my_cex_row,
- color_breaks,
- margins,
+ x, # matrix with row/col names already formatted
+ sample_cluster, # a binary input of TRUE or FALSE,
+ # indicating whether or not to perform
+ # hierarchical clustering of the sample columns
+ merged_asterisk, # matrix having dimensions of x, values "*" or ""
+ color_breaks, # breaks for color gradation, from create_breaks
+ # passed to `breaks` argument of `image`
+ margins = c(8, 15), # two integers setting the bottom and right margins
+ # to accommodate row and column labels
+ master_cex = 0.7, # basis for text sizes
...
) {
merged_scores <- x
@@ -908,44 +2261,128 @@
"' rather than 'matrix'.\n\n"
)
)
- } else if (nrow(x) < 2) {
- cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
- cat("\\begin{verbatim}\n")
- str(x)
- cat("\\end{verbatim}\n")
- } else if (ncol(x) < 2) {
- cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
- cat("\\begin{verbatim}\n")
- str(x)
- cat("\\end{verbatim}\n")
- } else {
- gplots::heatmap.2(
- x = merged_scores,
- Colv = sample_cluster,
- scale = "none",
- cellnote = merged_asterisk,
- notecol = "white",
- cexCol = 0.9,
- # Heuristically assign size of row labels
- cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
- srtCol = 45,
- srtRow = 45,
- notecex = 3 * my_cex_row,
- col = color_breaks[[2]],
- density.info = "none",
- trace = "none",
- breaks = color_breaks[[1]],
- lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
- lhei = c(0.4, 8.0, 1.1),
- lwid = c(0.5, 3),
- key = FALSE,
- margins = margins,
- ...
- )
+ cat_variable(x)
+ return(FALSE)
+ }
+ if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex))
+ nrow_x <- nrow(x)
+ ncol_x <- ncol(x)
+ #if (nrow_x < 2) {
+ if (nrow_x < 1) {
+ cat("No plot because matrix has no rows.\n\n")
+ return(FALSE)
+ } else if (nrow_x < 2) {
+ cat("No plot because matrix has one row. Matrix looks like this:\n\n")
+ cat("\n\\begin{verbatim}\n")
+ print(x)
+ cat("\n\\end{verbatim}\n")
+ return(FALSE)
+ } else if (ncol_x < 2) {
+ cat("No plot because matrix x has ", ncol_x, " columns.\n\n")
+ cat_variable(x)
+ return(FALSE)
}
+ max_nchar_rowname <- max(nchar(rownames(x)))
+ max_nchar_colname <- max(nchar(colnames(x)))
+ my_limit <- g_intensity_hm_rows
+
+ my_row_cex_scale <- master_cex * 150 / nrow_x
+ my_col_cex_scale <- 3.0
+ my_asterisk_scale <- 0.4 * my_row_cex_scale
+ my_row_warp <- 1
+ my_note_warp <- 2
+ my_row_warp <- 1
+ my_row_cex_asterisk <-
+ master_cex * my_row_warp * my_asterisk_scale
+
+
+ my_col_cex <- my_col_cex_scale * master_cex
+ my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex)
+ my_key_cex <- 1.286
+ my_hm2_cex <- 1 * master_cex
+ my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4
+ if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x))
+ if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
+ my_offset <- 0.05
+ if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset))
+ my_scale <- 3.0
+ if (ncol_x < 10 && nrow_x < 10)
+ my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x)
+
+ my_heights <- c(
+ 0.15,
+ 3.85 - my_offset,
+ 0.5 + my_offset
+ )
+ my_margins <- c(1, 1) +
+ c(
+ margins[1] * 0.08 * max_nchar_colname * my_col_cex,
+ margins[2] * 0.04 * max_nchar_rowname * my_row_cex
+ )
+
+ my_notecex <-
+ my_scale *
+ min(
+ 1.1,
+ my_row_cex_asterisk * my_note_warp,
+ my_col_cex * my_note_warp
+ )
+
+ if (print_trace_messages) {
+ cat_variable(my_heights, suffix = "; ")
+ cat_variable(my_margins, suffix = "\n\n")
+ cat_variable(my_row_cex_scale, suffix = "; ")
+ cat_variable(my_col_cex_scale, suffix = "\n\n")
+ cat_variable(my_row_cex_asterisk, suffix = "\n\n")
+ cat_variable(my_row_cex, suffix = "; ")
+ cat_variable(my_col_cex, suffix = "\n\n")
+ cat_variable(my_row_cex, suffix = "; ")
+ cat_variable(my_col_cex, suffix = "\n\n")
+ }
+
+ hm2plus(
+ x = merged_scores,
+ Colv = sample_cluster,
+ cellnote = merged_asterisk,
+ cex = my_hm2_cex,
+ cexCol = my_col_cex,
+ cexRow = my_row_cex,
+ denhgt = 0.15,
+ density_info = "none",
+ denwid = 0.5,
+ divergent = TRUE,
+ key_par = list(cex = my_key_cex),
+ key_xlab = "Z-score",
+ margins = my_margins,
+ notecex = my_scale * min(
+ 1.5,
+ my_row_cex_asterisk * my_note_warp,
+ my_col_cex * my_note_warp
+ ),
+ notecol = "white",
+ scale = "none",
+ srtcol = 90,
+ srtrow = 0,
+ trace = "none",
+ mat = matrix(
+ c(
+ c(0, 3, 3),
+ c(2, 1, 1),
+ c(0, 4, 0)
+ ),
+ nrow = 3,
+ ncol = 3,
+ byrow = TRUE
+ ),
+ widths = c(0.5, 3.1, 0.9),
+ heights = my_heights,
+ ...
+ )
+ return(TRUE)
}
-# Adapted from KSEAapp::KSEA.Heatmap
+# function drawing heatmap of contrast fold-change for each kinase,
+# adapted from KSEAapp::KSEA.Heatmap
ksea_heatmap <- function(
# the data frame outputs from the KSEA.Scores() function, in list format
score_list,
@@ -961,16 +2398,19 @@
m_cutoff,
# a numeric value between 0 and 1 indicating the p-value/FDR cutoff
# for indicating significant kinases in the heatmap
- p_cutoff =
- stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
+ p_cutoff = {
+ cat("argument 'p_cutoff' is required for function 'ksea_heatmap'")
+ param_df_exit()
+ knitr::knit_exit()
+ },
# a binary input of TRUE or FALSE, indicating whether or not to perform
# hierarchical clustering of the sample columns
sample_cluster,
# a binary input of TRUE or FALSE, indicating whether or not to export
# the heatmap as a .png image into the working directory
export = FALSE,
- # bottom and right margins; adjust as needed if contrast names are too long
- margins = c(6, 20),
+ # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long
+ margins = c(6, 6),
# print which kinases?
# - Mandatory argument, must be one of const_ksea_.*_kinases
which_kinases,
@@ -993,7 +2433,7 @@
master <-
Reduce(
f = function(...) {
- base::merge(..., by = "Kinase.Gene", all = FALSE)
+ base::merge(..., by = "Kinase.Gene", all = TRUE)
},
x = score_list_m
)
@@ -1019,15 +2459,13 @@
}
merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
- # begin hack to print only significant rows
asterisk_rows <- rowSums(merged_asterisk == "*") > 0
all_rows <- rownames(merged_stats)
names(asterisk_rows) <- all_rows
non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
- merged_scores_asterisk <- merged_scores[names(asterisk_rows), ]
- merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ]
- # end hack to print only significant rows
+ merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE]
+ merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE]
row_list <- list()
row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
@@ -1036,14 +2474,17 @@
i <- which_kinases
my_row_names <- row_list[[i]]
- scrs <- merged_scores[my_row_names, ]
- stts <- merged_stats[my_row_names, ]
+ scrs <- merged_scores[my_row_names, , drop = FALSE]
+ stts <- merged_stats[my_row_names, , drop = FALSE]
merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
color_breaks <- create_breaks(scrs)
+ if (is.null(color_breaks)) {
+ cat("No plot because matrix has too few rows.\n\n")
+ return(NULL)
+ }
plot_height <- nrow(scrs) ^ 0.55
plot_width <- ncol(scrs) ^ 0.7
- my_cex_row <- 0.25 * 16 / plot_height
if (export == "TRUE") {
png(
"KSEA.Merged.Heatmap.png",
@@ -1053,96 +2494,600 @@
pointsize = 14
)
}
- draw_kseaapp_summary_heatmap(
+ did_draw <- draw_kseaapp_summary_heatmap(
x = scrs,
sample_cluster = sample_cluster,
merged_asterisk = merged_asterisk,
- my_cex_row = my_cex_row,
color_breaks = color_breaks,
margins = margins
)
if (export == "TRUE") {
dev.off()
}
+ if (!did_draw)
+ return(NULL)
return(my_row_names)
}
-# helper for heatmaps of phosphopeptide intensities
-
-draw_intensity_heatmap <-
+# helpers for heatmaps of phosphopeptide intensities
+
+# factory producing function to truncate string after n characters
+trunc_n <- function(n) {
+ function(x) {
+ sapply(
+ X = x,
+ FUN = function(s) {
+ if (is.na(s))
+ return("NA")
+ cond <- try_catch_w_e(nchar(s) > n)
+ if (!is.logical(cond$value)) {
+ return(cond$value$message)
+ } else if (cond$value) {
+ paste0(
+ strtrim(s, n),
+ "..."
+ )
+ } else {
+ s
+ }
+ },
+ USE.NAMES = FALSE
+ )
+ }
+ }
+trunc_long_ppep <- function(x) trunc_n(40)(x)
+trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x)
+trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x)
+trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x)
+
+# factory producing a function that returns a covariance
+# matrix's rows (and columns) having variance > v_min
+keep_cov_w_var_gtr_min <- function(v_min) {
+ function(x) {
+ if (!is.matrix(x))
+ return(NULL)
+ keepers <- sapply(
+ X = seq_len(nrow(x)),
+ FUN = function(i) {
+ if (x[i, i] < v_min)
+ NA
+ else
+ x[i, i]
+ }
+ )
+ names(keepers) <- rownames(x)
+ keepers <- keepers[!is.na(keepers)]
+ keepers <- names(keepers)
+ if (length(keepers) == 0)
+ return(NULL)
+ x[keepers, keepers]
+ }
+}
+# function that returns a matrix's rows having variance > 1
+keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1)
+
+# factory producing a function that returns
+# - either a matrix's rows (rows = TRUE)
+# - or a matrix's columns (rows = FALSE)
+# having variance > v_min
+keep_var_gtr_min <- function(v_min) {
+ function(x, rows = TRUE) {
+ nrowcol <- if (rows) nrow else ncol
+ if (!is.matrix(x))
+ return(NULL)
+ keepers <- sapply(
+ X = seq_len(nrowcol(x)),
+ FUN = function(i) {
+ row_var <- var(
+ if (rows) x[i, ] else x[, i],
+ na.rm = TRUE
+ )
+ if (is.na(row_var) || row_var <= v_min) NA else i
+ }
+ )
+ keepers <- keepers[!is.na(keepers)]
+ if (rows) x[keepers, ] else x[, keepers]
+ }
+}
+
+keep_var_gtr_0 <- keep_var_gtr_min(0)
+
+# function drawing heatmap of phosphopeptide intensities
+ppep_heatmap <-
function(
m, # matrix with rownames already formatted
cutoff, # cutoff used by hm_heading_function
- hm_heading_function, # construct and cat heading from m and cutoff
+ hm_heading_function, # construct $ cat heading from m and cutoff
hm_main_title, # main title for plot (drawn below heading)
suppress_row_dendrogram = TRUE, # set to false to show dendrogram
- max_peptide_count # experimental:
- = intensity_hm_rows, # values of 50 and 75 worked well
- ... # passthru parameters for heatmap
+ max_peptide_count = # experimental:
+ g_intensity_hm_rows, # values of 50 and 75 worked well
+ master_cex = 1.0, # basis for text sizes
+ margins = NULL, # optional margins (bottom, right)
+ cellnote = NULL, # optional matrix of character; dim = dim(m)
+ adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right
+ ... # passthru to hm2plus or heatmap.2
) {
+ use_heatmap_1 <- FALSE
peptide_count <- 0
# emit the heading for the heatmap
if (hm_heading_function(m, cutoff)) {
- peptide_count <- min(max_peptide_count, nrow(m))
- if (nrow(m) > 1) {
+ nrow_m <- nrow(m)
+ peptide_count <- min(max_peptide_count, nrow_m)
+ if (nrow_m > 1) {
m_margin <- m[peptide_count:1, ]
- # Margin setting was heuristically derived
- margins <-
- c(0.5, # col
- max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row
- )
- }
- if (nrow(m) > 1) {
+ # Margin was heuristically derived to accommodate the widest label
+ row_mchar_max <- max(nchar(rownames(m_margin)))
+ col_mchar_max <- max(nchar(colnames(m_margin)))
+ row_margin <- master_cex * row_mchar_max * 2.6
+ col_margin <- master_cex * col_mchar_max * 2.6
+ if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin))
+ if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin))
+ hm_call <- NULL
tryCatch(
{
- old_oma <- par("oma")
- par(cex.main = 0.6)
- # Heuristically determined character size adjustment formula
- char_contractor <-
- 250000 / (
- max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
+ # set non-argument parameters for hm_call inner function
+ my_row_cex <-
+ master_cex * 200000 / (
+ (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows
)
- heatmap(
- m[peptide_count:1, ],
- Rowv = if (suppress_row_dendrogram) NA else NULL,
- Colv = NA,
- cexRow = char_contractor,
- cexCol = char_contractor * 50 / max_peptide_count,
- scale = "row",
- margins = margins,
- main =
- "Unimputed, unnormalized log(intensities)",
- xlab = "",
- las = 1,
- ...
+ m_hm <- m[peptide_count:1, , drop = FALSE]
+ if (is.null(cellnote)) {
+ cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm))
+ cellnote[is.na(m_hm)] <- "NA"
+ } else {
+ cellnote <- cellnote[peptide_count:1, , drop = FALSE]
+ }
+ m_hm[is.na(m_hm)] <- 0
+ nrow_m_hm <- nrow(m_hm)
+ ncol_m_hm <- ncol(m_hm)
+ if (nrow_m_hm < 1 || ncol_m_hm < 1)
+ return(peptide_count) # return zero as initialized above
+ my_limit <- g_intensity_hm_rows
+
+
+ my_row_cex <- master_cex * (100 / (2 + row_mchar_max))
+ my_col_cex <- master_cex * 6 * row_margin / col_margin
+ my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex
+ my_col_cex <- min(my_col_cex, my_row_cex)
+ col_margin <- sqrt(my_col_adj) * col_margin
+ if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
+ if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
+ if (is.null(margins)) my_margins <-
+ c(
+ (my_col_cex + col_margin), # col
+ (my_row_cex + row_margin) / my_row_cex # row
+ )
+ else
+ my_margins <- margins
+
+ if (print_trace_messages) cat(
+ sprintf(
+ "my_margins = c(%s)\n\n",
+ paste(my_margins, collapse = ", ")
+ )
+ )
+ my_hm2_cex <- 2 * master_cex
+ my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows
+ my_key_warp <- 1.5 * 22.75 / row_margin
+ my_key_cex <- min(1.10, my_key_warp * my_key_cex)
+ my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1)
+ my_hgt_scale <- 3.75 # 3.615
+ my_hgt_scale <- 3.60 # 3.615
+ if (print_trace_messages)
+ cat_variable(my_hgt_scale, "\n\n", 3)
+ my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows)
+ if (print_trace_messages)
+ cat_variable(my_warp, "\n\n", 3)
+ # added 0.9 heuristically...
+ my_plot_height <-
+ (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) *
+ min(my_hgt_scale, my_hgt_scale * my_warp)
+ my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50)
+ my_heights <- c(
+ 0.3, # title and top dendrogram
+ my_plot_height, # plot and bottom margin
+ 4.15 - my_hgt_scale, # legend
+ 0.05 + my_hgt_scale - my_plot_height # whitespace below legend
)
+ my_note_cex <- min(0.8, my_row_cex, my_col_cex)
+ if (print_trace_messages) {
+ cat_variable(my_plot_height, "\n\n", 3)
+ cat_variable(4.19 - my_hgt_scale, "\n\n", 3)
+ cat_variable(nrow_m_hm, "; ", 0)
+ cat_variable(ncol_m_hm, "; ", 0)
+ cat_variable(my_row_cex, "; ", 3)
+ cat_variable(my_col_cex, "; ", 3)
+ cat_variable(my_note_cex, "; ", 3)
+ cat_variable(my_key_cex, "\n\n", 3)
+ cat_variable(my_hgt_scale, "; ", 3)
+ cat_variable(my_plot_height, "; ", 3)
+ cat_variable(my_warp, "\n\n", 3)
+ cat_variable(my_heights, "; ", 2)
+ cat_variable(sum(my_heights), "\n\n", 3)
+ }
+
+ # define hm_call inner function
+ hm_call <- function(x, scaling, title) {
+ my_cex_main <- min(5.0, 220 / nchar(title))
+ op <- par(
+ cex.main = my_cex_main * master_cex,
+ adj = adj
+ )
+ if (
+ !is.null(
+ hm2plus(
+ x,
+ Colv = NA,
+ Rowv = TRUE,
+ cexRow = my_row_cex,
+ cexCol = my_col_cex,
+ dendrogram = "row",
+ las = 1,
+ main = title,
+ key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+ cex = my_hm2_cex,
+ key_par = list(cex = my_key_cex),
+ margins = my_margins,
+ widths = c(0.4, 2.6, 1.5),
+ heights = my_heights,
+ mat = matrix(
+ c(
+ c(0, 3, 3),
+ c(2, 1, 1),
+ c(0, 4, 0),
+ c(0, 0, 0)
+ ),
+ nrow = 4,
+ ncol = 3,
+ byrow = TRUE
+ ),
+ na.rm = TRUE,
+ scale = scaling,
+ srtcol = 90,
+ srtrow = 0,
+ xlab = "",
+ cellnote = cellnote,
+ notecex = my_note_cex,
+ ...
+ )
+ )
+ ) {
+ if (print_trace_messages) cat(
+ sprintf(
+ "my_heights = c(%s); sum = %0.3f\n\n",
+ paste(
+ sprintf("%0.3f", my_heights),
+ collapse = ", "
+ ),
+ sum(my_heights)
+ )
+ )
+ if (print_trace_messages) cat(
+ sprintf("my_key_cex = %0.3f\n\n",
+ my_key_cex)
+ )
+ if (print_trace_messages) cat(
+ sprintf("my_key_cex/my_heights[3] = %0.3f\n\n",
+ my_key_cex / my_heights[3])
+ )
+ if (print_trace_messages) cat(
+ sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n",
+ my_heights[2] / my_heights[3])
+ )
+ }
+ par(op)
+ }
+
+ # invoke hm_call inner function
+ if (sum(rowSums(!is.na(m_hm)) < 2))
+ hm_call(
+ m_hm,
+ "none",
+ "log(intensities), unscaled, unimputed, and unnormalized"
+ )
+ else
+ hm_call(
+ m_hm,
+ "row",
+ "log(intensities), row-scaled, unimputed, and unnormalized"
+ )
},
error = function(e) {
- cat(
- sprintf(
- "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n",
- e$message
+ if (!is.null(hm_call)) {
+ m_hm[is.na(m_hm)] <- 0
+ tryCatch(
+ {
+ if (nrow(m_hm) > 1)
+ hm_call(
+ m_hm,
+ "none",
+ paste(
+ "log(intensities), unscaled,",
+ "zero-imputed, unnormalized"
+ )
+ )
+ else
+ cat("\nThere are too few peptides to produce a heatmap.\n")
+ },
+ error = function(r) {
+ cat(
+ sprintf(
+ "\n%s %s Internal message: %s\n\\newline\n\n",
+ "Failure drawing heatmap,",
+ "possibly because of too many missing values.\n\\newline\n\n",
+ r$message
+ )
+ )
+ cat_margins()
+ }
+ )
+ } else {
+ cat(
+ "\nFailure drawing heatmap, possibly because of too many missing values.\n"
)
- )
- },
- finally = par(old_oma)
+ }
+ }
)
}
}
- return(peptide_count)
+ # return value:
+ peptide_count
}
+
+# function drawing heatmap of correlations if they exist, else covariances
+cov_heatmap <-
+ function(
+ m, # matrix with rownames already formatted
+ top_substrates = FALSE,
+ ... # passthru to hm2plus or heatmap.2
+ ) {
+ if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n")
+ #ACE nb(rowSums(m, na.rm = TRUE))
+ #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0)
+ #ACE nb(see_variable(bad_rows))
+ #ACE m <- m[-bad_rows, , drop = FALSE]
+ colnames_m <- colnames(m)
+ is_na_m <- is.na(m)
+ tmp <- m
+ tmp[is_na_m] <- 0
+
+ tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE
+
+ colnames_tmp <- colnames(tmp)
+
+ my_low_p_seq <- seq(
+ from = min(g_intensity_hm_rows, nrow(m)),
+ to = 1,
+ by = -1
+ )
+
+ if (g_correlate_substrates) {
+ # zap samples having zero or near-zero variance
+ tmp[is.na(tmp)] <- 0
+ nzv <- caret::nearZeroVar(
+ tmp, # matrix of values, samples x variables
+ freqCut = 1.01, # min(freq most prevalent value /
+ # freq second most prevalent)
+ uniqueCut = 99 # max(number of unique values /
+ # total number of samples)
+ )
+ tmp <- if (length(nzv) > 0) {
+ m[, -nzv, drop = FALSE]
+ } else {
+ m
+ }
+ } else {
+ tmp <- m[my_low_p_seq, , drop = FALSE]
+ }
+
+
+ t_m <- t(tmp)
+ t_m[is.na(t_m)] <- 0
+ prefiltered_nrow <- ncol(t_m)
+
+ my_corcov <- cov(t_m)
+ did_filter_rows <- did_filter_cols <- FALSE
+ if (g_correlate_substrates && !is_positive_definite(my_corcov)) {
+ my_correlate_substrates <- FALSE
+ t_m <- t(m[my_low_p_seq, , drop = FALSE])
+ t_m[is.na(t_m)] <- 0
+ unfiltered_row_count <- ncol(t_m)
+ unfiltered_col_count <- nrow(t_m)
+
+ # zap empty samples
+ t_m <- t_m[0 < rowSums(x = t_m), ]
+ # zap substrates present in fewer than two samples
+ foo <- t_m > 0
+ foo <- colSums(x = foo) > 1
+ t_m <- t_m[, foo]
+
+ did_filter_rows <- unfiltered_row_count > ncol(t_m)
+ did_filter_cols <- unfiltered_col_count > nrow(t_m)
+
+ colnames_tmp <- rownames(t_m)
+ my_corcov <- cov(t_m)
+ if (g_filter_cov_var_gt_1) {
+ my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
+ }
+ } else if (g_correlate_substrates) {
+ my_corcov <- cov2cor(my_corcov)
+ my_correlate_substrates <- TRUE
+ } else {
+ my_correlate_substrates <- FALSE
+ if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov)
+ }
+
+ omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp]
+ suffix <- if (length(omitted_samples) > 1) "s" else ""
+
+ f_omissions <-
+ function(is_corr) {
+ cat(
+ sprintf(
+ "Below is the %s plot for %s substrates",
+ if (is_corr) "correlation" else "covariance",
+ sprintf(
+ if (top_substrates)
+ "%0.0f \"highest-quality\""
+ else
+ "%0.0f",
+ ncol(t_m)
+ )
+ )
+ )
+ if (did_filter_cols) {
+ cat(sprintf(", omitting sample%s ", suffix))
+ latex_collapsed_vector(", ", omitted_samples)
+ }
+ cat(".\n\n")
+ }
+
+ if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) {
+ cat(
+ sprintf(
+ "\\newline\n%s %s plot.\n",
+ "Insufficient covariance to produce",
+ if (my_correlate_substrates)
+ "correlation"
+ else
+ "covariance"
+ ),
+ "\\newpage\n"
+ )
+ return(NULL)
+ }
+
+ cat("\\leavevmode\n", "\\newpage\n")
+ f_omissions(my_correlate_substrates)
+
+ master_cex <- 0.4
+ max_nchar <- max(nchar(rownames(t_m)))
+ my_limit <- g_intensity_hm_rows
+ diminution <- sqrt(my_limit / (my_limit + ncol(t_m)))
+ my_row_cex <-
+ my_col_cex <-
+ min(1.75, master_cex * 9 * diminution ^ 1.5)
+ my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar)
+ my_key_cex <- 1.4
+ my_hm2_cex <- 1.0 * master_cex
+ my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4)
+ my_hgt_scale <- 2.7
+
+ my_legend_height <- 4.0 - my_hgt_scale
+ my_legend_height <- 0.5 * my_key_cex
+ my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit
+ my_warp <- 0.8
+ my_legend_height <- 0.77
+ my_legend_height <- 0.67
+ my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height
+ my_legend_height <- my_warp * my_legend_height
+
+ parjust <- par(adj = 0.5)
+ on.exit(par(parjust))
+ my_corcov <- my_corcov[order(rownames(my_corcov)), ]
+ my_main <-
+ sprintf("%s among %s substrates %s",
+ if (my_correlate_substrates) "Correlation"
+ else "Covariance",
+ kinase_name,
+ if (!my_correlate_substrates &&
+ g_filter_cov_var_gt_1 &&
+ did_filter_rows
+ )
+ "having variance > 1"
+ else ""
+ )
+ my_main_nchar <- nchar(my_main)
+ my_heights <- c(
+ 0.3,
+ my_plot_height,
+ my_legend_height # was 4.0 - my_hgt_scale # was 4.19
+ )
+ if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar))
+ if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin))
+ if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height))
+ if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex))
+ if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex))
+ if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex))
+ if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex))
+ if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale))
+ if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height))
+ if (print_trace_messages) cat(
+ sprintf(
+ "my_heights = c(%s); sum = %0.3f\n\n",
+ paste(
+ sprintf("%0.3f", my_heights),
+ collapse = ", "
+ ),
+ sum(my_heights)
+ )
+ )
+ op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar)
+ on.exit(par(op))
+ hm2plus(
+ x = my_corcov,
+ cex = my_hm2_cex,
+ cexCol = my_col_cex,
+ cexRow = my_row_cex,
+ density_info = "none",
+ denhgt = 0.15,
+ denwid = 0.5,
+ divergent = TRUE,
+ key_par = list(cex = my_key_cex),
+ key_xlab = if (my_correlate_substrates) "Correlation"
+ else "Covariance",
+ main = my_main,
+ mat = matrix(
+ c(
+ c(0, 3, 3),
+ c(2, 1, 1),
+ c(0, 4, 0)
+ ),
+ nrow = 3,
+ ncol = 3,
+ byrow = TRUE
+ ),
+ heights = my_heights,
+ margins = c(my_margin, my_margin),
+ widths = c(0.5, 3.1, 0.9),
+ scale = "none",
+ symkey = TRUE,
+ symbreaks = TRUE,
+ symm = FALSE #TODO evaluate TRUE
+ # ...
+ )
+ } # end cov_heatmap
+
+### FILE IMPORT
+
+# function reading bzipped file to data.frame
+bzip2df <- function(d, f, ctor = bzfile) {
+ # read.delim file (by default, compressed by bzip2)
+ if (file.exists(f)) {
+ conn <- NULL
+ pf <- parent.frame()
+ tryCatch(
+ assign(
+ as.character(substitute(d)),
+ read.delim(conn <- bzfile(f, open = "r")),
+ pf
+ ),
+ finally = if (!is.null(conn)) close(conn)
+ )
+ }
+}
+
```
-```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-cat("\\listoftables\n")
-```
# Purpose
-To perform for phosphopeptides:
+The purpose of this analysis is to perform for phosphopeptides:
- imputation of missing values,
- quantile normalization,
-- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
+- ANOVA (using the R stats::`r params$oneWayManyCategories` function),
+- assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and
- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
- PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
- The Human Proteome Database [http://hprd.org](http://hprd.org)
@@ -1151,13 +3096,15 @@
```{r include = FALSE}
-### GLOBAL VARIABLES
-
-# parameters for KSEA
-
-ksea_cutoff_statistic <- params$kseaCutoffStatistic
-ksea_cutoff_threshold <- params$kseaCutoffThreshold
-ksea_min_kinase_count <- params$kseaMinKinaseCount
+if (params$kseaUseAbsoluteLog2FC) {
+ sfc <- "|s|"
+ pfc <- "|p|"
+ pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$"
+} else {
+ sfc <- "s"
+ pfc <- "p"
+ pfc_txt <- "${\\log_2 (\\text{fold-change}})$"
+}
ksea_heatmap_titles <- list()
ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
@@ -1177,24 +3124,11 @@
# hash to hold names of significantly enriched kinases
ksea_asterisk_hash <- new_env()
-# READ PARAMETERS (mostly)
-
-intensity_hm_rows <- params$intensityHeatmapRows
-# Input Filename
-input_file <- params$inputFile
-
-# First data column - ideally, this could be detected via regexSampleNames,
-# but for now leave it as is.
-first_data_column <- params$firstDataColumn
-fdc_is_integer <- is.integer(first_data_column)
-if (fdc_is_integer) {
- first_data_column <- as.integer(params$firstDataColumn)
-}
+# PROCESS (mostly read) PARAMETERS
# False discovery rate adjustment for ANOVA
# Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
-val_fdr <-
- read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
+val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "")
if (
ncol(val_fdr) != 1 ||
@@ -1202,62 +3136,17 @@
sum(val_fdr[, 1] < 0) ||
sum(val_fdr[, 1] > 1)
) {
- stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
+ cat("alphaFile should be one column of numbers within the range [0.0,1.0]")
+ param_df_exit()
+ knitr::knit_exit()
}
val_fdr <- val_fdr[, 1]
-#Imputed Data filename
-imputed_data_filename <- params$imputedDataFilename
-imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
-anova_ksea_mtdt_file <- params$anovaKseaMetadata
-
```
-```{r echo = FALSE}
-# Imputation method, should be one of
-# "random", "group-median", "median", or "mean"
-imputation_method <- params$imputationMethod
-
-# Selection of percentile of logvalue data to set the mean for random number
-# generation when using random imputation
-mean_percentile <- params$meanPercentile / 100.0
-
-# deviation adjustment-factor for random values; real number.
-sd_percentile <- params$sdPercentile
-
-# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
-regex_sample_names <- params$regexSampleNames
-
-# Regular expression to extract Sample Grouping from Sample Name;
-# if error occurs, compare sample_treatment_levels vs. sample_name_matches
-# to see if groupings/pairs line up
-# e.g., "(\\d+)"
-regex_sample_grouping <- params$regexSampleGrouping
-
-one_way_all_categories_fname <- params$oneWayManyCategories
-one_way_all_categories <- try_catch_w_e(
- match.fun(one_way_all_categories_fname))
-if (!is.function(one_way_all_categories$value)) {
- write("fatal error for parameter oneWayManyCategories:", stderr())
- write(one_way_all_categories$value$message, stderr())
- if (sys.nframe() > 0) quit(save = "no", status = 1)
- stop("Cannot continue. Goodbye.")
-}
-one_way_all_categories <- one_way_all_categories$value
-
-one_way_two_categories_fname <- params$oneWayManyCategories
-one_way_two_categories <- try_catch_w_e(
- match.fun(one_way_two_categories_fname))
-if (!is.function(one_way_two_categories$value)) {
- cat("fatal error for parameter oneWayTwoCategories: \n")
- cat(one_way_two_categories$value$message, fill = TRUE)
- if (sys.nframe() > 0) quit(save = "no", status = 1)
- stop("Cannot continue. Goodbye.")
-}
-one_way_two_categories <- one_way_two_categories$value
-
-preproc_db <- params$preprocDb
-ksea_app_prep_db <- params$kseaAppPrepDb
+```{r echo = FALSE, results = 'asis'}
+
+
result <- file.copy(
from = preproc_db,
to = ksea_app_prep_db,
@@ -1272,9 +3161,19 @@
),
stderr()
)
- if (sys.nframe() > 0) quit(save = "no", status = 1)
- stop("Cannot continue. Goodbye.")
+ if (sys.nframe() > 0) {
+ cat("Cannot continue and quit() failed. Goodbye.")
+ param_df_exit()
+ knitr::knit_exit()
+ # in case knit_exit doesn't exit
+ quit(save = "no", status = 1)
+ }
}
+
+if (FALSE) {
+ write.table(x = param_df, file = "test-data/params.txt")
+}
+
```
```{r echo = FALSE}
@@ -1289,55 +3188,75 @@
quote = "",
check.names = FALSE
)
+
```
-# Extract Sample Names and Treatment Levels
-
-Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
+
+# Extraction of Sample Classes and Names from Input Data
```{r echo = FALSE, results = 'asis'}
data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
+my_column_names <- names(full_data)
if (!fdc_is_integer) {
if (length(data_column_indices) > 0) {
first_data_column <- data_column_indices[1]
} else {
- stop(paste("failed to convert firstDataColumn:", first_data_column))
+ cat(paste("failed to convert firstDataColumn:", first_data_column))
+ param_df_exit()
+ knitr::knit_exit()
}
}
cat(
sprintf(
paste(
- "\n\nThe input data file has peptide-intensity data for each sample",
- "in one of columns %d through %d.\n\n"
+ "\n\nThe input data file has peptide-intensity data",
+ "in columns %d (\"%s\") through %d (\"%s\")."
),
- min(data_column_indices),
- max(data_column_indices)
+ tmp <- min(data_column_indices),
+ my_column_names[tmp],
+ tmp <- max(data_column_indices),
+ my_column_names[tmp]
)
)
-# Write column names as a LaTeX enumerated list.
-column_name_df <- data.frame(
- column = seq_len(length(colnames(full_data))),
- name = paste0("\\verb@", colnames(full_data), "@")
+if (TRUE) {
+ cat0(
+ table_offset(i = 1, new = TRUE),
+ "Sample classes and names are shown in ",
+ table_href(),
+ ".\n\n"
)
-data_frame_latex(
- x = column_name_df,
- justification = "l l",
- centered = TRUE,
- caption = "Input data column names",
- anchor = const_table_anchor_bp,
- underscore_whack = FALSE
+} else {
+ cat0(
+ "\\newcounter{offset}\n",
+ "\\setcounter{offset}{\\value{table}}\n",
+ "\\stepcounter{offset}\n",
+ "Sample classes and names are shown in ",
+ table_href(),
+ ".\n\n"
)
+}
+
+#TODO remove this unused variable and assignment
+if (FALSE) {
+ # Write column names as a LaTeX enumerated list.
+ column_name_df <- data.frame(
+ column = seq_len(length(colnames(full_data))),
+ name = paste0("\\verb@", colnames(full_data), "@")
+ )
+}
```
```{r echo = FALSE, results = 'asis'}
+# extract intensity columns from full_data to quant_data
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA
rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
+full_data_names <- colnames(quant_data)
# Extract factors and trt-replicates using regular expressions.
# Typically:
# regex_sample_names is "\\.\\d+[A-Z]$"
@@ -1355,20 +3274,133 @@
write_debug_file(quant_data)
rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
-sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
-number_of_samples <- length(sample_name_matches)
+smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
+
+if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n"))
+if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n"))
+if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n"))
+
+sample_treatment_df <-
+ save_sample_treatment_df <-
+ data.frame(
+ class = smpl_trt,
+ sample = sample_name_matches,
+ full_sample_names = full_data_names
+ )
+
+if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n"))
+
+# reorder data
+my_order <- with(sample_treatment_df, order(class, sample))
+quant_data <- quant_data[, my_order]
+sample_name_matches <- sample_name_matches[my_order]
+smpl_trt <- smpl_trt[my_order]
sample_treatment_df <- data.frame(
- level = sample_treatment_levels,
+ class = smpl_trt,
sample = sample_name_matches
)
-data_frame_latex(
+
+# filter smpl_trt as appropriate
+if (sample_group_filter %in% c("include", "exclude")) {
+ include_sample <-
+ mgrepl(
+ v = sample_group_filter_patterns,
+ s = as.character(smpl_trt),
+ fixed = sample_group_filter_fixed,
+ perl = sample_group_filter_perl,
+ ignore.case = sample_group_filter_nocase
+ )
+ if (sum(include_sample) < 2) {
+ errmsg <-
+ paste(
+ "ERROR:",
+ sum(include_sample),
+ "samples are too few for analysis;",
+ "check input parameters for sample-name parsing"
+ )
+ cat0(
+ errmsg,
+ "\\stepcounter{offset}\n",
+ " in ",
+ table_href(),
+ ".\n\n"
+ )
+ data_frame_tabbing_latex(
+ x = save_sample_treatment_df,
+ tabstops = c(1.25, 1.25),
+ caption = "Sample classes",
+ use_subsubsection_header = FALSE
+ )
+ data_frame_tabbing_latex(
+ x =
+ param_df[
+ c("regexSampleNames",
+ "regexSampleGrouping",
+ "groupFilterPatterns",
+ "groupFilter",
+ "groupFilterMode"
+ ),
+ ],
+ tabstops = c(1.75),
+ underscore_whack = TRUE,
+ caption = "Input parameters for sample-name parsing",
+ verbatim = FALSE
+ )
+ param_df_exit()
+ knitr::knit_exit()
+ return(invisible(-1))
+ }
+ sample_treatment_df <-
+ if (sample_group_filter == "include")
+ sample_treatment_df[include_sample, ]
+ else
+ sample_treatment_df[!include_sample, ]
+} else {
+ include_sample <- rep.int(TRUE, length(smpl_trt))
+}
+sample_name_matches <- sample_treatment_df$sample
+rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match))
+sample_treatment_df$class <- smpl_trt
+
+# filter quant_data as appropriate
+number_of_samples <- length(sample_name_matches)
+quant_data <- quant_data[, sample_name_matches]
+
+sample_level_integers <- as.integer(smpl_trt)
+sample_treatment_levels <- levels(smpl_trt)
+count_of_treatment_levels <- length(sample_treatment_levels)
+
+# for each phosphopeptide, across treatment levels, compute minimum
+# count of observed (i.e., non-missing) values
+my_env <- new_env()
+for (l in sample_treatment_levels)
+ my_env[[as.character(l)]] <-
+ as.vector(rowSums(!is.na(quant_data[l == smpl_trt])))
+min_group_obs_count <- row_apply(
+ x = Reduce(
+ f = function(l, r) cbind(l, my_env[[r]]),
+ x = sample_treatment_levels,
+ init = c()
+ ),
+ fun = min
+ )
+names(min_group_obs_count) <- rownames(quant_data)
+rm(my_env)
+
+# display (possibly-filtered) results
+cat("\\newpage\n")
+
+if (sum(include_sample) > 1) {
+data_frame_tabbing_latex(
x = sample_treatment_df,
- justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
- centered = TRUE,
- caption = "Treatment levels",
- anchor = const_table_anchor_tbp,
- underscore_whack = FALSE
+ tabstops = c(1.25),
+ caption = "Sample classes",
+ use_subsubsection_header = FALSE
)
+}
+sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6))
+sample_colsep <- transition_positions(as.integer(sample_treatment_df$class))
```
```{r echo = FALSE, results = 'asis'}
@@ -1377,7 +3409,7 @@
## Are the log-transformed sample distributions similar?
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
quant_data[quant_data == 0] <- NA #replace 0 with NA
quant_data_log <- log10(quant_data)
@@ -1387,25 +3419,81 @@
write_debug_file(quant_data_log)
-# data visualization
-old_par <- par(
- mai = par("mai") + c(0.5, 0, 0, 0)
-)
-# ref: https://r-charts.com/distribution/add-points-boxplot/
-# Vertical plot
-boxplot(
- quant_data_log
-, las = 1
-, col = const_boxplot_fill
-, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
-, xlab = "Sample"
-)
-par(old_par)
-
-
-
-cat("\n\n\n")
-cat("\n\n\n")
+g_ppep_distrib_ctl <- new_env()
+g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow)
+g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow))
+g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow))
+
+my_ppep_distrib_bxp <- function(
+ x
+ , sample_name_grow = sample_name_grow
+ , main
+ , varwidth = FALSE
+ , sub = NULL
+ , xlab
+ , ylab
+ , col = const_boxplot_fill
+ , notch = FALSE
+ , ppep_distrib_ctl = g_ppep_distrib_ctl
+ , ...
+ ) {
+ my_xlab_line <- g_ppep_distrib_ctl$xlab_line
+ my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom
+ my_axis <- g_ppep_distrib_ctl$axis
+
+ if (print_trace_messages) {
+ cat_variable(my_xlab_line, suffix = "; ")
+ cat_variable(my_mai_bottom, suffix = "; ")
+ cat_variable(my_axis, suffix = "\n\n")
+ }
+
+ old_par <- par(
+ mai = par("mai") + c(my_mai_bottom, 0, 0, 0),
+ cex.axis = my_axis,
+ cex.lab = 1.2
+ )
+ tryCatch(
+ {
+ # Vertical plot
+ boxplot(
+ x
+ , las = 2
+ , col = col
+ , main = main
+ , sub = NULL
+ , ylab = ylab
+ , xlab = NULL
+ , notch = notch
+ , varwidth = varwidth
+ , ...
+ )
+ title(
+ sub = sub
+ , cex.sub = 1.0
+ , line = my_xlab_line + 1
+ )
+ title(
+ xlab = xlab
+ , line = my_xlab_line
+ )
+ },
+ finally = par(old_par)
+ )
+ }
+
+my_ppep_distrib_bxp(
+ x = quant_data_log
+ , sample_name_grow = sample_name_grow
+ , main = "Peptide intensities for each sample"
+ , varwidth = boxplot_varwidth
+ , sub = "Box widths reflect number of peptides for sample"
+ , xlab = "Sample"
+ , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ , col = const_boxplot_fill
+ , notch = FALSE
+ )
+
+cat("\n\n\n\n")
```
@@ -1454,6 +3542,8 @@
)
```
+# Characterization of Input Data
+
## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
@@ -1461,7 +3551,7 @@
q1 <- quantile(logvalues, probs = mean_percentile)[1]
# 1 = row of matrix (ie, phosphopeptide)
-sds <- apply(quant_data_log, 1, sd_finite)
+sds <- row_apply(quant_data_log, sd_finite)
if (sum(!is.na(sds)) > 2) {
plot(
density(sds, na.rm = TRUE)
@@ -1495,7 +3585,7 @@
# prep for trt-median based imputation
```
-# Impute Missing Values
+# Imputation of Missing Values
```{r echo = FALSE}
@@ -1526,11 +3616,10 @@
paste("Substitute missing value with",
"median peptide-intensity for sample group.\n"
)
- sample_level_integers <- as.integer(sample_treatment_levels)
# Take the accurate ln(x+1) because the data are log-normally distributed
# and because median can involve an average of two measurements.
quant_data_imp <- log1p(quant_data_imp)
- for (i in seq_len(length(levels(sample_treatment_levels)))) {
+ for (i in seq_len(count_of_treatment_levels)) {
# Determine the columns for this factor-level
level_cols <- i == sample_level_integers
# Extract those columns
@@ -1541,7 +3630,7 @@
# a given ppep has no measurement; otherwise, proceed.
if (ncol(lvlsbst) > 1) {
the_centers <-
- apply(lvlsbst, 1, median, na.rm = TRUE)
+ row_apply(lvlsbst, median, na.rm = TRUE)
for (j in seq_len(nrow(lvlsbst))) {
for (k in seq_len(ncol(lvlsbst))) {
if (is.na(lvlsbst[j, k])) {
@@ -1565,7 +3654,7 @@
# Take the accurate ln(x+1) because the data are log-normally distributed
# and because median can involve an average of two measurements.
quant_data_imp <- log1p(quant_data_imp)
- quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
+ quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]]
# Take the accurate e^x - 1 to match scaling of original input.
quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
good_rows <- !is.nan(rowMeans(quant_data_imp))
@@ -1581,7 +3670,7 @@
# this will have to be
quant_data_imp <- log1p(quant_data_imp)
# Assign to NA cells the mean for the row
- quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
+ quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]]
# Take the accurate e^x - 1 to match scaling of original input.
quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
good_rows <- !is.nan(rowMeans(quant_data_imp))
@@ -1625,11 +3714,40 @@
imp_smry_pot_peptides_after <- sum(good_rows)
imp_smry_rejected_after <- sum(!good_rows)
imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ]))
+
+# From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0
+
+sink(stderr())
+print("`%in%`:")
+print(`%in%`)
+sink()
+
+stock_in <-
+ names(good_rows) %in%
+ names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+if (print_nb_messages) nbe(see_variable(stock_in), "\n")
+
+explicit_in <-
+ 0 < match(
+ names(good_rows),
+ names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+ )
+if (print_nb_messages) nbe(see_variable(explicit_in), "\n")
+
+great_enough_row_names <- good_rows[
+ names(good_rows) %in%
+ names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count])
+]
+if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
+great_enough_row_names <- great_enough_row_names[great_enough_row_names]
+if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n")
```
+
```{r echo = FALSE, results = 'asis'}
# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
tabular_lines_fmt <- paste(
"\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
+ " \\leavevmode",
" \\caption{Imputation Results}",
" \\centering", # \centering centers the table on the page
" \\begin{tabular}{l c c c}",
@@ -1637,8 +3755,9 @@
" \\ & potential peptides & missing values & rejected",
" peptides \\\\ [0.5ex]",
" \\hline",
- " before imputation & %d & %d (%d\\%s) & \\\\",
- " after imputation & %d & %d & %d \\\\ [1ex]",
+ " before imputation & %d & %d (%d\\%s) & \\\\",
+ " after imputation & %d & %d & %d \\\\",
+ " after keep comparable & %d & & %d \\\\ [1ex]",
" \\hline",
" \\end{tabular}",
#" \\label{table:nonlin}", # may be used to refer this table in the text
@@ -1654,18 +3773,37 @@
"%",
imp_smry_pot_peptides_after,
imp_smry_missing_values_after,
- imp_smry_rejected_after
+ imp_smry_rejected_after,
+ length(great_enough_row_names),
+ imp_smry_pot_peptides_before -
+ length(great_enough_row_names)
)
cat(tabular_lines)
```
-```{r echo = FALSE}
-
-
-# Zap rows where imputation was ineffective
+
+```{r filter_good_rows, echo = FALSE}
+
+if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n")
+good_rows <- names(good_rows[names(great_enough_row_names)])
+if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n")
+
+#ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)]
+#ACE nbe("good_rows")
+#ACE nbe(see_variable(good_rows))
+#ACE nbe("names(min_group_obs_count) before filter for good rows")
+#ACE nbe(see_variable(names(min_group_obs_count)))
+min_group_obs_count <- min_group_obs_count[good_rows]
+#ACE nbe("min_group_obs_count after filter for good rows")
+#ACE nbe(see_variable(names(min_group_obs_count)))
+
+# Zap rows where imputation was insufficiently effective
full_data <- full_data [good_rows, ]
quant_data <- quant_data [good_rows, ]
-
+quant_data_log <- quant_data_log [good_rows, ]
+
+if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n")
quant_data_imp <- quant_data_imp[good_rows, ]
+if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n")
write_debug_file(quant_data_imp)
quant_data_imp_good_rows <- quant_data_imp
@@ -1719,35 +3857,41 @@
```
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
zero_sd_rownames <-
rownames(quant_data_imp)[
- is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0)
+ is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0)
]
if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
- stop("All peptides have zero standard deviation. Cannot continue.")
+ cat("All peptides have zero standard deviation. Cannot continue.")
+ param_df_exit()
+ knitr::knit_exit()
}
if (length(zero_sd_rownames) > 0) {
cat(
- sprintf("%d peptides with zero variance were removed from statistical consideration",
- length(zero_sd_rownames)
+ sprintf(
+ "%d %s %s",
+ length(zero_sd_rownames),
+ "peptides with zero variance",
+ "were removed from statistical consideration"
)
)
zap_named_rows <- function(df, nms) {
return(df[!(row.names(df) %in% nms), ])
}
- quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
- quant_data <- zap_named_rows(quant_data, zero_sd_rownames)
- full_data <- zap_named_rows(full_data, zero_sd_rownames)
+ quant_data_imp <-
+ zap_named_rows(quant_data_imp, zero_sd_rownames)
+ quant_data <-
+ zap_named_rows(quant_data, zero_sd_rownames)
+ full_data <-
+ zap_named_rows(full_data, zero_sd_rownames)
+ min_group_obs_count <-
+ min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames]
}
if (sum(is.na(quant_data)) > 0) {
cat("\\leavevmode\\newpage\n")
- # data visualization
- old_par <- par(
- mai = par("mai") + c(0.5, 0, 0, 0)
- )
# Copy quant data to x
x <- quant_data
# x gets to have values of:
@@ -1776,22 +3920,24 @@
show_stripchart <-
50 > (count_red + count_blue) / length(sample_name_matches)
if (show_stripchart) {
- boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
+ boxplot_sub <- "Light blue = data before imputation; Red = imputed data;"
} else {
boxplot_sub <- ""
}
# Vertical plot
colnames(blue_dots) <- sample_name_matches
- boxplot(
- blue_dots
- , las = 1 # "always horizontal"
- , col = const_boxplot_fill
- , ylim = ylim
+ my_ppep_distrib_bxp(
+ x = blue_dots
+ , sample_name_grow = sample_name_grow
, main = "Peptide intensities after eliminating unusable peptides"
- , sub = boxplot_sub
+ , varwidth = boxplot_varwidth
+ , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample")
, xlab = "Sample"
, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ , col = const_boxplot_fill
+ , notch = FALSE
+ , ylim = ylim
)
if (show_stripchart) {
@@ -1803,7 +3949,7 @@
method = "jitter", # Random noise
jitter = const_stripchart_jitter,
pch = 19, # Pch symbols
- cex = const_stripsmall_cex, # Size of symbols reduced
+ cex = const_stripchart_cex, # Size of symbols reduced
col = "lightblue", # Color of the symbol
vertical = TRUE, # Vertical mode
add = TRUE # Add it over
@@ -1813,49 +3959,71 @@
method = "jitter", # Random noise
jitter = const_stripchart_jitter,
pch = 19, # Pch symbols
- cex = const_stripsmall_cex, # Size of symbols reduced
+ cex = const_stripchart_cex, # Size of symbols reduced
col = "red", # Color of the symbol
vertical = TRUE, # Vertical mode
add = TRUE # Add it over
)
}
- if (TRUE) {
- # show measured values in blue on left half-violin plot
- cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
- vioplot::vioplot(
- x = lapply(blue_dots, function(x) x[!is.na(x)]),
- col = "lightblue1",
- side = "left",
- plotCentre = "line",
- ylim = ylim_save,
- main = "Distributions of observed and imputed data",
- sub = "Light blue = observed data; Pink = imputed data",
- xlab = "Sample",
- ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+}
+```
+
+```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+if (sum(is.na(quant_data)) > 0) {
+ # show measured values in blue on left half-violin plot
+ cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
+ old_par <- par(
+ mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0),
+ cex.axis = g_ppep_distrib_ctl$axis,
+ cex.lab = 1.2
+ )
+ tryCatch(
+ {
+ vioplot::vioplot(
+ x = lapply(blue_dots, function(x) x[!is.na(x)]),
+ col = "lightblue1",
+ side = "left",
+ plotCentre = "line",
+ ylim = ylim_save,
+ main = "Distributions of observed and imputed data",
+ sub = NULL,
+ las = 2,
+ xlab = NULL,
+ ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ )
+ title(
+ sub = "Light blue = observed data; Pink = imputed data",
+ cex.sub = 1.0,
+ line = g_ppep_distrib_ctl$xlab_line + 1
)
- red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
- cols_to_delete <- c()
- for (ix in seq_len(length(red_violins))) {
- if (length(red_violins[[ix]]) < 1) {
- cols_to_delete <- c(cols_to_delete, ix)
+ title(
+ xlab = "Sample",
+ line = g_ppep_distrib_ctl$xlab_line
+ )
+ red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
+ cols_to_delete <- c()
+ for (ix in seq_len(length(red_violins))) {
+ if (length(red_violins[[ix]]) < 1) {
+ cols_to_delete <- c(cols_to_delete, ix)
+ }
}
- }
- # destroy any unimputable columns
- if (!is.null(cols_to_delete)) {
- red_violins <- red_violins[-cols_to_delete]
- }
- # plot imputed values in red on right half-violin plot
- vioplot::vioplot(
- x = red_violins,
- col = "lightpink1",
- side = "right",
- plotCentre = "line",
- add = TRUE
- )
- }
-
- par(old_par)
+ # destroy any unimputable columns
+ if (!is.null(cols_to_delete)) {
+ red_violins <- red_violins[-cols_to_delete]
+ }
+ # plot imputed values in red on right half-violin plot
+ vioplot::vioplot(
+ x = red_violins,
+ col = "lightpink1",
+ side = "right",
+ plotCentre = "line",
+ add = TRUE
+ )
+
+ },
+ finally = par(old_par)
+ )
# density plot
cat("\\leavevmode\n\n\n\n\n\n\n")
@@ -1893,88 +4061,88 @@
}
```
-# Perform Quantile Normalization
+# Quantile Normalization
The excellent `normalize.quantiles` function from
*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
performs "quantile normalization" as described Bolstad *et al.* (2003),
DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
-and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
+and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html),
i.e., it assumes that the goal is to detect
subtle differences among grossly similar samples (having similar distributions)
-by equailzing intra-quantile quantitations.
-Unfortunately, one software library upon which it depends
+by equalizing intra-quantile quantitations^[Unfortunately,
+one software library upon which `preprocessCore` depends
*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
-that requires that a specific, non-concurrent version of the library be
+that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be
installed. The installation command equivalent to what was used to install the library to produce the results presented here is:
-```
- conda install bioconductor-preprocesscore openblas=0.3.3
-```
+\linebreak
+` conda install bioconductor-preprocesscore openblas=0.3.3`].
```{r echo = FALSE, results = 'asis'}
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n")
if (nrow(quant_data_imp) > 0) {
quant_data_imp_qn <- preprocessCore::normalize.quantiles(
as.matrix(quant_data_imp), keep.names = TRUE
@@ -1983,17 +4151,20 @@
quant_data_imp_qn <- as.matrix(quant_data_imp)
}
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n")
+
quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
-
write_debug_file(quant_data_imp_qn)
quant_data_imp_qn_log <- log10(quant_data_imp_qn)
-
write_debug_file(quant_data_imp_qn_log)
+if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
+if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
+
quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
-sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
+sel <- row_apply(quant_data_imp_qn_ls, any_nan)
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
@@ -2008,10 +4179,9 @@
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
```
-
## Are normalized, imputed, log-transformed sample distributions similar?
-```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
+```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'}
# Save unimputed quant_data_log for plotting below
unimputed_quant_data_log <- quant_data_log
@@ -2034,6 +4204,22 @@
# data visualization
+ if (TRUE) {
+
+ my_ppep_distrib_bxp(
+ x = quant_data_log
+ , sample_name_grow = sample_name_grow
+ , main = "Peptide intensities for each sample"
+ , varwidth = boxplot_varwidth
+ , sub = NULL
+ , xlab = "Sample"
+ , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ , col = const_boxplot_fill
+ , notch = boxplot_notch
+ )
+
+ } else {
+
old_par <- par(
mai = par("mai") + c(0.5, 0, 0, 0)
, oma = par("oma") + c(0.5, 0, 0, 0)
@@ -2043,12 +4229,16 @@
colnames(quant_data_log) <- sample_name_matches
boxplot(
quant_data_log
- , las = 1
+ , las = 2
+ , cex.axis = 0.9 * sample_name_grow
, col = const_boxplot_fill
, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
, xlab = "Sample"
+ , notch = boxplot_notch
+ , varwidth = boxplot_varwidth
)
par(old_par)
+ }
} else {
cat("There are no peptides to plot\n")
}
@@ -2074,7 +4264,28 @@
# ANOVA Analysis
-```{r, echo = FALSE}
+## Assignment of $p$-value and quality score
+
+For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups.
+
+However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x).
+
+Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as:
+
+$$
+\textit{quality}_j
+ = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005}
+$$
+
+where:
+
+- $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and
+- $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$.
+
+
+```{r, echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+
# Make new data frame containing only Phosphopeptides
# to connect preANOVA to ANOVA (connect_df)
connect_df <- data.frame(
@@ -2084,16 +4295,10 @@
colnames(connect_df) <- c("Phosphopeptide", "Intensity")
```
-```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-count_of_treatment_levels <- length(levels(sample_treatment_levels))
+```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'}
+g_can_run_ksea <- FALSE
+old_oma <- par("oma")
if (count_of_treatment_levels < 2) {
- nuke_control_sequences <-
- function(s) {
- s <- gsub("[\\]", "xyzzy_plugh", s)
- s <- gsub("[$]", "\\\\$", s)
- s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
- return(s)
- }
cat(
"ERROR!!!! Cannot perform ANOVA analysis",
"(see next page)\\newpage\n"
@@ -2109,7 +4314,7 @@
paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"),
"\n\\end{quote}\n\n")
- regex_sample_names <- nuke_control_sequences(regex_sample_names)
+ regex_sample_names <- latex_printable_control_seqs(regex_sample_names)
cat("\\leavevmode\n\n\n")
cat("Parsing rule for SampleNames is",
@@ -2125,7 +4330,7 @@
paste(sample_name_matches, collapse = "\n\n\n"),
"\n\\end{quote}\n\n")
- regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping)
+ regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping)
cat("\\leavevmode\n\n\n")
cat("Parsing rule for SampleGrouping is",
@@ -2144,31 +4349,90 @@
} else {
- p_value_data_anova_ps <-
- apply(
- quant_data_imp_qn_log,
- 1,
- anova_func,
- grouping_factor = sample_treatment_levels,
- one_way_f = one_way_all_categories
- )
+ if (print_nb_messages) nbe("computing p_value_data_anova_ps\n")
+ if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n")
+ if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n")
+ if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n")
+ if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n")
+ if (print_nb_messages) nbe(see_variable(anova_func), "\n")
+ if (print_nb_messages) nbe(see_variable(smpl_trt), "\n")
+ if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n")
+ tryCatch(
+ {
+ p_value_data_anova_ps <-
+ row_apply(
+ quant_data_imp_qn_log,
+ anova_func,
+ grouping_factor = smpl_trt,
+ one_way_f = one_way_all_categories
+ )
+ },
+ error = function(e) {
+ mesg <- paste("Could not compute ANOVA because", e$message)
+ cat("\n\n", mesg, "\n\n")
+ param_df_noexit(e)
+ sink(stderr())
+ cat("\n\n", mesg, "\n\n")
+ values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n")
+ cat(values)
+ sink()
+ knitr::knit_exit()
+ exit(code = 1)
+ }
+ )
+ if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n")
p_value_data_anova_ps_fdr <-
p.adjust(p_value_data_anova_ps, method = "fdr")
+ my_ppep_v <- full_data[, 1]
+ p_value_data <- list(
+ phosphopeptide = my_ppep_v,
+ raw_anova_p = p_value_data_anova_ps,
+ fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
+ missing_values = rowSums(is.na(quant_data)),
+ min_group_obs_count = min_group_obs_count
+ )
p_value_data <- data.frame(
- phosphopeptide = full_data[, 1]
- ,
- raw_anova_p = p_value_data_anova_ps
- ,
- fdr_adjusted_anova_p = p_value_data_anova_ps_fdr
+ phosphopeptide = my_ppep_v,
+ raw_anova_p = p_value_data_anova_ps,
+ fdr_adjusted_anova_p = p_value_data_anova_ps_fdr,
+ missing_values = rowSums(is.na(quant_data)),
+ min_group_obs_count = min_group_obs_count
)
+ p_value_data$quality <- 1.0 / with(
+ p_value_data,
+ fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count)
+ )
+
+ p_value_data$ranking <-
+ with(
+ p_value_data,
+ switch(
+ g_intensity_hm_criteria,
+ "quality" = order(-quality),
+ "na_count" = order(missing_values, fdr_adjusted_anova_p),
+ # the default is "p_value"
+ order(fdr_adjusted_anova_p)
+ )
+ )
+ p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE]
+
+ write.table(
+ p_value_data,
+ file = "p_value_data.txt",
+ sep = "\t",
+ col.names = TRUE,
+ row.names = FALSE,
+ quote = FALSE
+ )
+
# output ANOVA file to constructed filename,
# e.g. "Outputfile_pST_ANOVA_STEP5.txt"
- # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
+ # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt"
# Re-output datasets to include p-values
- metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
+ metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)])
write.table(
cbind(metadata_plus_p, quant_data_imp),
file = imputed_data_filename,
@@ -2188,61 +4452,97 @@
)
- p_value_data <-
- p_value_data[order(p_value_data$fdr_adjusted_anova_p), ]
-
first_page_suppress <- 1
number_of_peptides_found <- 0
cutoff <- val_fdr[1]
for (cutoff in val_fdr) {
- if (number_of_peptides_found > 49) {
+ #loop through FDR cutoffs
+ if (number_of_peptides_found > g_intensity_hm_rows - 1) {
cat("\\leavevmode\n\n\n")
break
}
- #loop through FDR cutoffs
+ bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff)
+ bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class)
+ g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0)
+ bool_4 <- (p_value_data$quality >= params$minQuality)
+ bool_3 <- as.logical(
+ as.integer(bool_1) *
+ as.integer(bool_2) *
+ as.integer(bool_4)
+ )
+ if (print_trace_messages) {
+ if (length(bool_1) > 30) {
+ cat_variable(bool_1, force_str = TRUE)
+ cat_variable(bool_2, force_str = TRUE)
+ cat_variable(bool_3, force_str = TRUE)
+ } else {
+ cat_variable(bool_1, suffix = "\n\n")
+ cat_variable(bool_2, suffix = "\n\n")
+ cat_variable(bool_3, suffix = "\n\n")
+ }
+ cat_variable(length(bool_3), digits = 0, suffix = "; ")
+ cat_variable(sum(bool_3), digits = 0, suffix = "\n\n")
+ }
filtered_p <-
- p_value_data[
- which(p_value_data$fdr_adjusted_anova_p < cutoff),
- , drop = FALSE
- ]
+ p_value_data[bool_3, , drop = FALSE]
+ filtered_p <-
+ filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE]
+
+ if (print_trace_messages)
+ cat_variable(filtered_p, force_str = TRUE)
+
+ have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0
+ filter_result_string <-
+ sprintf(
+ "%s, %s of %0.0f peptides remained having both %s and %s.\n\n",
+ "After filtering for ANOVA results",
+ if (have_remaining_peptides)
+ as.character(sum(bool_3, na.rm = TRUE))
+ else
+ "none",
+ length(bool_3),
+ sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))),
+ sprintf(
+ "more than %0.0f observations in some groups",
+ max(0, g_intensity_min_per_class - 1)
+ )
+ )
+
filtered_data_filtered <-
quant_data_imp_qn_log[
rownames(filtered_p),
, drop = FALSE
]
+ # order by p-value
filtered_data_filtered <-
filtered_data_filtered[
order(filtered_p$fdr_adjusted_anova_p),
, drop = FALSE
]
- #
-
- if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
+ if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) {
if (first_page_suppress == 1) {
first_page_suppress <- 0
} else {
cat("\\newpage\n")
}
- if (nrow(filtered_data_filtered) > 1) {
- subsection_header(sprintf(
- "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
- nrow(filtered_data_filtered),
- cutoff
- ))
- } else {
- subsection_header(sprintf(
- "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
- rownames(filtered_data_filtered)[1],
- cutoff
- ))
- }
- cat("\n\n\n")
- cat("\n\n\n")
-
- old_oma <- par("oma")
+ latex_samepage({
+ cat(filter_result_string)
+ if (nrow(filtered_data_filtered) > 1) {
+ cat(subsection_header(sprintf(
+ "Intensity distributions for %d phosphopeptides\n",
+ nrow(filtered_data_filtered)
+ )))
+ } else {
+ cat(subsection_header(sprintf(
+ "Intensity distribution for one phosphopeptide (%s)\n",
+ rownames(filtered_data_filtered)[1]
+ )))
+ }
+ }) # end latex_samepage
+
old_par <- par(
mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1),
oma = old_oma * c(1, 1, 0.3, 1),
@@ -2250,18 +4550,24 @@
cex.axis = 0.7,
fin = c(9, 7.25)
)
- # ref: https://r-charts.com/distribution/add-points-boxplot/
# Vertical plot
colnames(filtered_data_filtered) <- sample_name_matches
tryCatch(
boxplot(
filtered_data_filtered,
main = "Imputed, normalized intensities", # no line plot
- las = 1,
+ las = 2,
+ cex.axis = 0.9 * sample_name_grow,
col = const_boxplot_fill,
- ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+ notch = FALSE,
+ varwidth = boxplot_varwidth
),
- error = function(e) print(e)
+ error = function(e) {
+ print(e)
+ cat_margins()
+ }
+
)
par(old_par)
} else {
@@ -2272,7 +4578,7 @@
))
}
- if (nrow(filtered_data_filtered) > 0) {
+ if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) {
# Add Phosphopeptide column to anova_filtered table
# The assumption here is that the first intensity is unique;
# this is a hokey assumption but almost definitely will
@@ -2305,37 +4611,26 @@
# Produce heatmap to visualize significance and the effect of imputation
- anova_filtered_merge_format <- sapply(
- X = filtered_p$fdr_adjusted_anova_p
- ,
- FUN = function(x) {
- if (x > 0.0001)
- paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s")
- else
- paste0("(%0.4e) %s")
- }
- )
-
cat_hm_heading <- function(m, cutoff) {
- cat("\\newpage\n")
- if (nrow(m) > intensity_hm_rows) {
- subsection_header(
+ if (nrow(m) > g_intensity_hm_rows) {
+ cat("\\clearpage\n")
+ cat(subsection_header(
paste(
sprintf("Heatmap for the %d most-significant peptides",
- intensity_hm_rows),
+ g_intensity_hm_rows),
sprintf("whose adjusted p-value < %0.2f\n", cutoff)
)
- )
+ ))
} else {
- if (nrow(m) == 1) {
+ if (nrow(m) == 0) {
return(FALSE)
} else {
- subsection_header(
+ cat(subsection_header(
paste(
- sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+ sprintf("Heatmap for %d usable peptide genes whose", nrow(m)),
sprintf("adjusted p-value < %0.2f\n", cutoff)
)
- )
+ ))
}
}
cat("\n\n\n")
@@ -2346,40 +4641,125 @@
# construct matrix with appropriate rownames
m <-
as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
- if (nrow(m) > 0) {
+ nrow_m <- nrow(m)
+ ncol_m <- ncol(m)
+ if (nrow_m > 0) {
rownames_m <- rownames(m)
- rownames(m) <- sapply(
- X = seq_len(nrow(m))
- ,
+ q <- data.frame(pepname = rownames_m)
+ g <- sqldf("
+ SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name
+ FROM q, metadata_plus_p AS met
+ WHERE q.pepname = met.Phosphopeptide
+ ORDER BY q.rowid
+ ")
+ tmp <- sapply(
+ X = seq_len(nrow(g)),
+ FUN = function(i) {
+ pre <- strsplit(g$gene_name[i], "; ")[[1]]
+ rslt <- paste(unique(pre), sep = "; ")
+ return(rslt)
+ }
+ )
+ tmp <- unlist(tmp)
+ tmp <-
+ make.names(tmp, unique = TRUE)
+ tmp <- sub(
+ "No_Gene_Name",
+ "not_found",
+ tmp,
+ fixed = TRUE
+ )
+ ten_trunc_names <- trunc_ppep(rownames_m)
+ tmp <- sapply(
+ X = seq_len(nrow_m),
FUN = function(i) {
sprintf(
- anova_filtered_merge_format[i],
- filtered_p$fdr_adjusted_anova_p[i],
- rownames_m[i]
+ "(%s) %s",
+ tmp[i],
+ ten_trunc_names[i]
)
}
)
+ rownames(m) <- tmp
}
# draw the heading and heatmap
- if (nrow(m) > 0) {
+ if (nrow_m > 0) {
number_of_peptides_found <-
- draw_intensity_heatmap(
+ ppep_heatmap(
m = m,
cutoff = cutoff,
hm_heading_function = cat_hm_heading,
- hm_main_title = "Unimputed, unnormalized log(intensities)",
- suppress_row_dendrogram = FALSE
+ hm_main_title =
+ "log(intensities), row-scaled, unimputed, unnormalized",
+ suppress_row_dendrogram = FALSE,
+ master_cex = 0.35,
+ sepcolor = "black",
+ colsep = sample_colsep
)
+ if (number_of_peptides_found > 1) {
+ cat("\\leavevmode\n")
+ }
}
}
}
}
-cat("\\leavevmode\n\n\n")
+cat(filter_result_string)
+cat("\\leavevmode\n")
+
+if (!g_can_run_ksea) {
+ errmsg <- paste("Cannot proceed with KSEA analysis",
+ "because too many values are missing.")
+ if (FALSE) cat0(
+ errmsg,
+ "\\stepcounter{offset}\n",
+ "\\stepcounter{offset}\n",
+ "\\stepcounter{offset}\n",
+ " in ",
+ table_href(),
+ ".\n\n"
+ )
+ if (FALSE) {
+ if (print_nb_messages) nbe(see_variable(p_value_data))
+ } else {
+ if (print_nb_messages) nbe(see_variable(p_value_data))
+
+ display_p_value_data <- p_value_data
+ display_p_value_data$raw_anova_p <-
+ sprintf("%0.3g", display_p_value_data$raw_anova_p)
+ display_p_value_data$fdr_adjusted_anova_p <-
+ sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p)
+ display_p_value_data$quality <-
+ sprintf("%0.3g", display_p_value_data$quality)
+
+ headers_1st_line <-
+ c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "")
+ headers_2nd_line <-
+ c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking")
+ data_frame_tabbing_latex(
+ x = display_p_value_data,
+ tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60),
+ use_subsubsection_header = FALSE,
+ headings = c(headers_1st_line, headers_2nd_line),
+ caption = "ANOVA results"
+
+ )
+ }
+ data_frame_tabbing_latex(
+ x = save_sample_treatment_df,
+ tabstops = c(1.25, 1.25),
+ caption = "Sample classes",
+ use_subsubsection_header = FALSE
+ )
+ param_df_exit()
+ knitr::knit_exit()
+ return(invisible(-1))
+}
+
```
```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-if (count_of_treatment_levels > 1) {
+if (g_can_run_ksea && count_of_treatment_levels > 1) {
# Prepare two-way contrasts with adjusted p-values
# Strategy:
# - use imputed, log-transformed data:
@@ -2395,7 +4775,7 @@
# Each contrast is between a combination of trt levels
m2 <- combn(
- x = seq_len(length(levels(sample_treatment_levels))),
+ x = seq_len(length(levels(smpl_trt))),
m = 2,
simplify = TRUE
)
@@ -2409,13 +4789,13 @@
return(
data.frame(
contrast = cntrst,
- level = sample_treatment_levels[
- sample_treatment_levels %in%
- levels(sample_treatment_levels)[c(lvl1, lvl2)]
+ level = smpl_trt[
+ smpl_trt %in%
+ levels(smpl_trt)[c(lvl1, lvl2)]
],
label = sample_name_matches[
- sample_treatment_levels %in%
- levels(sample_treatment_levels)[c(lvl1, lvl2)]
+ smpl_trt %in%
+ levels(smpl_trt)[c(lvl1, lvl2)]
]
)
)
@@ -2595,6 +4975,7 @@
)
# - create contrast-metadata table
+ if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata")
dml_no_rows_exec(db, "
CREATE TABLE contrast_lvl_lvl_metadata
AS
@@ -2705,12 +5086,11 @@
# - run the two-level (one-way) test
p_value_data_contrast_ps <-
- apply(
- X = contrast_cast_data,
- MARGIN = 1, # apply to rows
- FUN = anova_func,
+ row_apply(
+ x = contrast_cast_data,
+ fun = anova_func,
grouping_factor =
- as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
+ as.factor(grouping_factor$level), # anova_func arg2
one_way_f = one_way_two_categories, # anova_func arg3
simplify = TRUE # TRUE is the default for simplify
)
@@ -2922,6 +5302,8 @@
;
"
)
+ # We are done with DDL and insertion
+ RSQLite::dbDisconnect(db)
}
```
@@ -2929,7 +5311,7 @@
cat("\\newpage\n")
```
-# KSEA Analysis
+# KSEA Analysis Summaries
Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
@@ -2940,24 +5322,24 @@
For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
$$
-\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
+\text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j}
$$
and fold-enrichment is computed as:
$$
-\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
+\text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j}
$$
where:
-- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
-- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
+- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$,
+- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and
- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
-- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
+- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset.
- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
-$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
+$\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score.
Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
@@ -2965,9 +5347,14 @@
- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
-- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
-
-```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
+
+For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality".
+
+Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1.
+
+```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'}
+cat("\\clearpage\n")
db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
@@ -3046,21 +5433,9 @@
contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
contrast_longlabel <- (
sprintf(
- "Trt %s {%s} -> Trt %s {%s}",
+ "Class %s -> Class %s",
contrast_metadata_df[i_cntrst, "b_level"],
- gsub(
- pattern = ";",
- replacement = ", ",
- x = contrast_metadata_df[i_cntrst, "b_samples"],
- fixed = TRUE
- ),
- contrast_metadata_df[i_cntrst, "a_level"],
- gsub(
- pattern = ";",
- replacement = ", ",
- x = contrast_metadata_df[i_cntrst, "a_samples"],
- fixed = TRUE
- )
+ contrast_metadata_df[i_cntrst, "a_level"]
)
)
kseaapp_input <-
@@ -3091,18 +5466,44 @@
expr = {
ksea_scores_rslt <-
ksea_scores(
- ksdata = pseudo_ksdata, # KSEAapp::KSData,
- px = kseaapp_input,
- networkin = TRUE,
- networkin_cutoff = 2
+ ksdata = pseudo_ksdata,
+ px = kseaapp_input,
+ networkin = TRUE,
+ networkin_cutoff = 2,
+ minimum_substrate_count = ksea_min_substrate_count
)
+ if (FALSE) {
+ ksea_scores_rslt <-
+ ksea_scores_rslt[
+ ksea_scores_rslt$m >= ksea_min_substrate_count,
+ ,
+ drop = FALSE
+ ]
+ }
+
+ if (FALSE) {
+ data_frame_tabbing_latex(
+ x = ksea_scores_rslt,
+ tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8),
+ caption = paste("KSEA scores for contrast ",
+ cntrst_b_level, "to", cntrst_a_level),
+ use_subsubsection_header = FALSE
+ )
+ }
+
+ if (FALSE) {
+ if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n")
+ cat_variable(ksea_scores_rslt)
+ cat("\n\\clearpage\n")
+ }
+
if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
next_index <- 1 + next_index
rslt$score_list[[next_index]] <- ksea_scores_rslt
rslt$name_list[[next_index]] <- contrast_label
rslt$longname_list[[next_index]] <- contrast_longlabel
- low_fdr_print(
+ ksea_low_fdr_print(
rslt = rslt,
i_cntrst = i_cntrst,
i = next_index,
@@ -3113,21 +5514,24 @@
)
}
},
- error = function(e) str(e)
+ error = function(e) {
+ str(e)
+ cat_margins()
+ }
)
}
plotted_kinases <- NULL
-if (length(rslt$score_list) > 1) {
+if (g_can_run_ksea && length(rslt$score_list) > 1) {
for (i in seq_len(length(ksea_heatmap_titles))) {
hdr <- ksea_heatmap_titles[[i]]
which_kinases <- i
cat("\\clearpage\n\\begin{center}\n")
if (i == const_ksea_astrsk_kinases) {
- subsection_header(hdr)
+ cat(subsection_header(hdr))
} else {
- subsection_header(hdr)
+ cat(subsection_header(hdr))
}
cat("\\end{center}\n")
@@ -3146,7 +5550,7 @@
m_cutoff = 1,
# a numeric value between 0 and 1 indicating the p-value/FDR cutoff
# for indicating significant kinases in the heatmap
- p_cutoff = 0.05,
+ p_cutoff = params$kseaCutoffThreshold,
# a binary input of TRUE or FALSE, indicating whether or not to perform
# hierarchical clustering of the sample columns
sample_cluster = TRUE,
@@ -3163,131 +5567,446 @@
# - 1 : all kinases
# - 2 : significant kinases
# - 3 : non-significant kinases
- which_kinases = which_kinases
+ which_kinases = which_kinases,
+ margins = c(7, 15)
)
- cat("\\begin{center}\n")
- cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n")
- cat("\\end{center}\n")
+ if (!is.null(plotted_kinases)) {
+ cat("\\begin{center}\n")
+ if (which_kinases != const_ksea_nonastrsk_kinases)
+ cat("Asterisks reflect significance.\n")
+ cat("\\end{center}\n")
+ }
} # end for (i in ...
} # end if (length ...
-
-for (i_cntrst in seq_len(length(rslt$score_list))) {
- next_index <- i_cntrst
- cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
- cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
- cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
- contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
- contrast_longlabel <- (
- sprintf(
- "Trt %s {%s} -> Trt %s {%s}",
- contrast_metadata_df[i_cntrst, "b_level"],
- gsub(
- pattern = ";",
- replacement = ", ",
- x = contrast_metadata_df[i_cntrst, "b_samples"],
- fixed = TRUE
- ),
- contrast_metadata_df[i_cntrst, "a_level"],
- gsub(
- pattern = ";",
- replacement = ", ",
- x = contrast_metadata_df[i_cntrst, "a_samples"],
- fixed = TRUE
+```
+
+```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'}
+ksea_prints <- list()
+ksea_barplots <- list()
+ for (i_cntrst in seq_len(length(rslt$score_list))) {
+ next_index <- i_cntrst
+ cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+ cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+ cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+ contrast_longlabel <- (
+ sprintf(
+ "Class %s -> Class %s",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ contrast_metadata_df[i_cntrst, "a_level"]
+ )
+ )
+ main_title <- (
+ sprintf(
+ "Change from treatment %s to treatment %s",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ contrast_metadata_df[i_cntrst, "a_level"]
+ )
)
+ sub_title <- contrast_longlabel
+ tryCatch(
+ expr = {
+ ksea_scores_rslt <- rslt$score_list[[next_index]]
+ if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE
+
+ if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+ sink(deferred <- file())
+ ksea_low_fdr_print(
+ rslt = rslt,
+ i_cntrst = i_cntrst,
+ i = next_index,
+ a_level = cntrst_a_level,
+ b_level = cntrst_b_level,
+ fold_change = cntrst_fold_change,
+ caption = contrast_longlabel,
+ write_db = FALSE,
+ anchor = const_table_anchor_t
+ )
+ cat("\n")
+ sink()
+ lines <-
+ paste(
+ readLines(deferred, warn = FALSE),
+ collapse = "\n"
+ )
+ close(deferred)
+ sq_put(ksea_prints, lines)
+ sink(stderr())
+ cat("\n---\n")
+ cat_variable(ksea_prints)
+ barplot_closure <-
+ ksea_low_fdr_barplot_factory(
+ rslt = rslt,
+ i_cntrst = i_cntrst,
+ i = next_index,
+ a_level = cntrst_a_level,
+ b_level = cntrst_b_level,
+ fold_change = cntrst_fold_change,
+ caption = contrast_longlabel
+ )
+ if (rlang::is_closure(barplot_closure))
+ sq_put(ksea_barplots, barplot_closure)
+ else
+ sq_put(ksea_barplots, no_op)
+ str(ksea_barplots)
+ cat("\n...\n")
+ sink()
+ }
+ },
+ error = function(e) {
+ str(e)
+ cat_margins()
+ }
+ )
+ }
+```
+
+```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
+
+have_kinase_descriptions <-
+ if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) &&
+ !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2))
+ ) {
+ rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID
+ kinase_name_to_desc_uprt <- function(s) {
+ rslt <- NULL
+ tryCatch(
+ {
+ which_rows <- eval(s == kinase_name_uprt_lut$kinase)
+ kinase_uprtid <-
+ kinase_name_uprt_lut[which_rows, 2]
+ # filter for first _HUMAN match if any
+ grepl_human <- grepl("_HUMAN$", kinase_uprtid)
+ if (0 < sum(grepl_human))
+ kinase_uprtid <- kinase_uprtid[grepl_human]
+ # filter for first match if any
+ if (0 < length(kinase_uprtid)) {
+ kinase_uprtid <- kinase_uprtid[1]
+ kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2]
+ if (!is.na(kinase_desc))
+ rslt <- c(kinase_desc, kinase_uprtid)
+ else
+ rslt <- c(kinase_desc, "")
+ }
+ },
+ warning = str
+ )
+ rslt
+ }
+ TRUE
+ } else {
+ kinase_name_to_desc_uprt <- function(s) NULL
+ FALSE
+ }
+```
+
+```{r write_params, echo = FALSE, results = 'asis'}
+# perhaps this should be moved into the functions section, eventually ...
+write_params <- function(db) {
+ # write parameters to report
+
+ # write parameters to SQLite output
+
+ mqppep_anova_script_param_df <- data.frame(
+ script = "mqppep_anova_script.Rmd",
+ parameter = names(param_unlist),
+ value = param_unlist
)
+ ddl_exec(db, "
+ DROP TABLE IF EXISTS script_parameter;
+ "
)
- main_title <- (
- sprintf(
- "Change from treatment %s to treatment %s",
- contrast_metadata_df[i_cntrst, "b_level"],
- contrast_metadata_df[i_cntrst, "a_level"]
- )
+ ddl_exec(db, "
+ CREATE TABLE IF NOT EXISTS script_parameter(
+ script TEXT,
+ parameter TEXT,
+ value ANY,
+ UNIQUE (script, parameter) ON CONFLICT REPLACE
+ )
+ ;
+ "
+ )
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "script_parameter",
+ value = mqppep_anova_script_param_df,
+ append = TRUE
)
- sub_title <- contrast_longlabel
- tryCatch(
- expr = {
- ksea_scores_rslt <- rslt$score_list[[next_index]]
-
- if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
- low_fdr_barplot(
- rslt = rslt,
- i_cntrst = i_cntrst,
- i = next_index,
- a_level = cntrst_a_level,
- b_level = cntrst_b_level,
- fold_change = cntrst_fold_change,
- caption = contrast_longlabel
- )
- }
- },
- error = function(e) str(e)
- )
+
+ loaded_packages_df <- sessioninfo::package_info("loaded")
+ loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
+ loaded_packages_df <- data.frame(
+ package = loaded_packages_df$package,
+ version = loaded_packages_df$loadedversion,
+ date = loaded_packages_df$date
+ )
+ #ACE cat("\\clearpage\n\\section{R package versions}\n")
+ #ACE data_frame_tabbing_latex(
+ #ACE x = loaded_packages_df,
+ #ACE tabstops = c(2.5, 1.25),
+ #ACE caption = "R package versions"
+ #ACE )
+ cat("\\clearpage\n\\section{Input parameter settings}\n")
+ data_frame_tabbing_latex(
+ x = param_df,
+ tabstops = c(1.75),
+ underscore_whack = TRUE,
+ caption = "Input parameters",
+ verbatim = FALSE
+ )
+}
+
+if (!have_kinase_descriptions) {
+ write_params(db)
+ # We are done with output
+ RSQLite::dbDisconnect(db)
+ param_df_exit()
+ knitr::knit_exit()
}
```
-```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-
-# Use enriched kinases to find enriched kinase-substrate pairs
-enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
-all_enriched_substrates <- sqldf("
- SELECT
- gene AS kinase,
- ppep,
- '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
- FROM (
- SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
- FROM pseudo_ksdata
- WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'}
+if (have_kinase_descriptions) {
+ my_section_header <-
+ sprintf(
+ "inases whose KSEA %s < %s\n",
+ ksea_cutoff_statistic,
+ signif(ksea_cutoff_threshold, 2)
+ )
+
+ # Use enriched kinases to find enriched kinase-substrate pairs
+ enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
+
+ enriched_kinase_descs <-
+ Reduce(
+ f = function(l, r) {
+ lkup <- kinase_name_to_desc_uprt(r)
+ if (is.null(lkup)) l
+ else r2 <- rbind(
+ l,
+ data.frame(
+ kinase = r,
+ uniprot_id = lkup[2],
+ description = lkup[1]
+ )
+ )
+ },
+ x = enriched_kinases$kinase,
+ init = NULL
+ )
+
+ if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) {
+ cat("\n\\clearpage\n")
+ data_frame_tabbing_latex(
+ x = enriched_kinase_descs,
+ tabstops = c(0.9, 1.3),
+ headings = c("Kinase", "UniProt ID", "Description"),
+ caption = paste0("Descriptions of k", my_section_header)
+ )
+ }
+
+ if (FALSE) {
+ cat_variable(sqldf("SELECT kinase FROM enriched_kinases"))
+ cat_variable(sqldf("
+ SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE gene IN (SELECT kinase FROM enriched_kinases)
+ "))
+ data_frame_table_latex(
+ x = sqldf("
+ SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE gene IN (SELECT kinase FROM enriched_kinases)
+ "),
+ justification = "l l l",
+ centered = TRUE,
+ caption = "substrates of enriched kinases",
+ anchor = c(const_table_anchor_p, const_table_anchor_t),
+ underscore_whack = TRUE
)
- GROUP BY ppep
- ")
-
-# helper used to label per-kinase substrate enrichment figure
-cat_enriched_heading <- function(m, cut_args) {
- cutoff <- cut_args$cutoff
- kinase <- cut_args$kinase
- statistic <- cut_args$statistic
- threshold <- cut_args$threshold
- cat("\\newpage\n")
- if (nrow(m) > intensity_hm_rows) {
- subsection_header(
- paste(
+ data_frame_table_latex(
+ x = sqldf("
+ SELECT
+ gene AS kinase,
+ ppep,
+ sub_gene,
+ '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
+ fdr_adjusted_anova_p,
+ quality,
+ min_group_obs_count
+ FROM (
+ SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE gene IN (SELECT kinase FROM enriched_kinases)
+ ),
+ p_value_data
+ WHERE ppep = phosphopeptide
+ GROUP BY kinase, ppep
+ ORDER BY kinase, ppep, p_value_data.quality DESC
+ "),
+ justification = "l l l l l l l",
+ centered = TRUE,
+ caption = "labeled substrates of enriched kinases",
+ anchor = c(const_table_anchor_p, const_table_anchor_t),
+ underscore_whack = TRUE
+ )
+ }
+ all_enriched_substrates <- sqldf("
+ SELECT
+ gene AS kinase,
+ ppep,
+ sub_gene,
+ '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label,
+ fdr_adjusted_anova_p,
+ quality,
+ min_group_obs_count
+ FROM (
+ SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE gene IN (SELECT kinase FROM enriched_kinases)
+ ),
+ p_value_data
+ WHERE ppep = phosphopeptide
+ GROUP BY kinase, ppep
+ ORDER BY kinase, ppep, p_value_data.quality DESC
+ ")
+
+ all_enriched_substrates <-
+ all_enriched_substrates[
+ all_enriched_substrates$quality >= params$minQuality,
+ ,
+ drop = FALSE
+ ]
+
+ all_enriched_substrates$sub_gene <-
+ sub(
+ " ///.*",
+ " ...",
+ all_enriched_substrates$sub_gene
+ )
+
+ all_enriched_substrates$label <-
+ with(
+ all_enriched_substrates,
+ sprintf(
+ "(%s-%s) %s",
+ kinase,
+ trunc_subgene(sub_gene),
+ ppep
+ )
+ )
+
+ # this global is set to TRUE by cat_enriched_heading immediately below
+ g_neednewpage <- FALSE
+
+ # helper used to label per-kinase substrate enrichment figure
+ cat_enriched_heading <- function(m, cut_args) {
+ cutoff <- cut_args$cutoff
+ kinase <- cut_args$kinase
+ if (g_neednewpage) cat("\\newpage\n")
+ g_neednewpage <- TRUE
+ if (nrow(m) > g_intensity_hm_rows) {
+ cat(subsection_header(
sprintf(
- "Lowest p-valued %d (of %d) enriched %s-substrates,",
- intensity_hm_rows,
+ "Highest-quality %d (of %d) enriched %s-substrates",
+ g_intensity_hm_rows,
nrow(m),
kinase
- ),
- sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
- )
- )
- } else {
- if (nrow(m) == 1) {
- return(FALSE)
+ )
+ ))
} else {
- subsection_header(
- paste(
+ if (nrow(m) == 0) {
+ return(FALSE)
+ } else {
+ nrow_m <- nrow(m)
+ cat(subsection_header(
sprintf(
- "%d enriched %s-substrates,",
- nrow(m),
- kinase
- ),
- sprintf(
- " KSEA %s < %0.2f\n",
- statistic,
- threshold
+ "%d enriched %s-substrate%s",
+ nrow_m,
+ kinase,
+ if (nrow_m > 1) "s" else ""
)
+ ))
+ }
+ }
+ cat("\n\n\n")
+ cat("\n\n\n")
+ return(TRUE)
+ }
+
+ # --------------------------------
+ # hack begin - show all substrates
+ enriched_substrates <- all_enriched_substrates
+ # add "FALSE &&" to prevent listing of substrates
+ if (show_enriched_substrates && nrow(enriched_substrates) > 0) {
+ short_row_names <- sub(
+ "$FAILED_MATCH_GENE_NAME",
+ "not_found",
+ enriched_substrates$sub_gene,
+ fixed = TRUE
+ )
+
+ if (print_nb_messages) nbe(see_variable(enriched_substrates))
+ substrates_df <- with(
+ enriched_substrates,
+ data.frame(
+ kinase = kinase,
+ substrate = sub(" ///*", "...", short_row_names),
+ anova_p_value = signif(fdr_adjusted_anova_p, 2),
+ min_group_obs_count = signif(min_group_obs_count, 0),
+ quality = signif(quality, 3),
+ sequence = trunc_n(30)(ppep)
)
)
- }
+
+ substrates_df <- substrates_df[
+ with(substrates_df, order(kinase, -quality)),
+ ,
+ drop = FALSE
+ ]
+
+ if (print_nb_messages) nbe(see_variable(substrates_df))
+ if (nrow(substrates_df) < 1)
+ substrates_df$sequence <- c()
+ if (print_nb_messages) nbe(see_variable(substrates_df))
+ names(substrates_df) <- headers_2nd_line <-
+ c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence")
+ headers_1st_line <- c("", "", "ANOVA", "min(values", "", "")
+ data_frame_tabbing_latex(
+ x = substrates_df,
+ tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5),
+ headings = c(headers_1st_line, headers_2nd_line),
+ caption = "Details for all enriched substrates of enriched kinases"
+ )
+ rm(
+ enriched_substrates,
+ substrates_df,
+ short_row_names,
+ headers_1st_line,
+ headers_2nd_line
+ )
}
- cat("\n\n\n")
- cat("\n\n\n")
- return(TRUE)
+ cat("\\clearpage\n")
+ # hack end - show all substrates
+ # --------------------------------
+
+ # print deferred tables and graphs for kinases from contrasts
+ for (i_cntrst in seq_len(length(ksea_prints))) {
+ #latex_samepage({
+ cat(ksea_prints[[i_cntrst]])
+ cat("\n")
+ ksea_barplots[[i_cntrst]]()
+ cat("\n")
+ cat("\\clearpage\n")
+ #})
+ }
+
}
-
-# Disabling heatmaps for substrates pending decision whether to eliminate them altogether
-if (FALSE)
+```
+
+```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'}
+if (g_can_run_ksea) {
+ g_did_enriched_header <- FALSE
for (kinase_name in sort(enriched_kinases$kinase)) {
enriched_substrates <-
all_enriched_substrates[
@@ -3295,135 +6014,256 @@
,
drop = FALSE
]
+ ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep)
+ enriched_substrates$label <- with(
+ enriched_substrates,
+ sprintf(
+ "(%s) %s",
+ make.names(
+ sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE),
+ unique = TRUE
+ ),
+ ten_trunc_ppep
+ )
+ )
# Get the intensity values for the heatmap
enriched_intensities <-
as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
+
# Remove rows having too many NA values to be relevant
- na_counter <- is.na(enriched_intensities)
- na_counts <- apply(na_counter, 1, sum)
- enriched_intensities <-
- enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
+ good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0)
+ #ACE nbe(see_variable(good_rows), "\n")
+ enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE]
+ enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE]
+
# Rename the rows with the display-name for the heatmap
- rownames(enriched_intensities) <-
+ short_row_names <- sub(
+ "$FAILED_MATCH_GENE_NAME",
+ "not_found",
+ enriched_substrates$sub_gene,
+ fixed = TRUE
+ )
+ short_row_names <-
+ make.names(short_row_names, unique = TRUE)
+ long_row_names <-
sapply(
X = rownames(enriched_intensities),
FUN = function(rn) {
enriched_substrates[enriched_substrates$ppep == rn, "label"]
}
)
+ rownames(enriched_intensities) <- long_row_names
# Format as matrix for heatmap
m <- as.matrix(enriched_intensities)
+ rownames(m) <- trunc_enriched_substrate(rownames(m))
+
+ #ACE nb("m with bad rows: ", see_variable(m), "\n")
+ #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0)
+ #ACE nb(see_variable(good_rows), "\n")
+ #ACE m <- m[good_rows, , drop = FALSE]
+ #ACE nb("m without(?) bad rows: ", see_variable(m), "\n")
+ #ACE nb(see_variable(short_row_names), "\n")
+ #ACE local_short_row_names <- short_row_names[good_rows]
+ #ACE local_long_row_names <- long_row_names[good_rows]
+ #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ]
+
# Draw the heading and heatmap
- if (nrow(m) > 0) {
+ nrow_m <- nrow(m)
+ if (nrow_m > 0) {
+ if (!g_did_enriched_header) {
+ cat("\n\\clearpage\n")
+ cat(section_header(paste0("K", my_section_header)))
+ g_did_enriched_header <- TRUE
+ }
+ is_na_m <- is.na(m)
+ cellnote_m <- is_na_m
+ cellnote_m[!is_na_m] <- ""
+ cellnote_m[is_na_m] <- "NA"
cut_args <- new_env()
cut_args$cutoff <- cutoff
cut_args$kinase <- kinase_name
cut_args$statistic <- ksea_cutoff_statistic
cut_args$threshold <- ksea_cutoff_threshold
number_of_peptides_found <-
- draw_intensity_heatmap(
+ ppep_heatmap(
m = m,
+ cellnote = cellnote_m,
cutoff = cut_args,
hm_heading_function = cat_enriched_heading,
hm_main_title
= "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
- suppress_row_dendrogram = FALSE
+ suppress_row_dendrogram = FALSE,
+ master_cex = 0.35,
+ sepcolor = "black",
+ colsep = sample_colsep
)
+ if (number_of_peptides_found > 1) {
+
+ tryCatch(
+ {
+ rownames(m) <- short_row_names
+ cov_heatmap(m, nrow_m > g_intensity_hm_rows)
+ },
+ error = function(e) {
+ cat(
+ sprintf(
+ "ERROR: %s\n\\newline\n",
+ mget("e")
+ )
+ )
+ cat(
+ sprintf(
+ "message: %s\n\\newline\n",
+ e$message
+ )
+ )
+ cat_margins()
+ }
+ )
+ }
+ substrates_df <- with(
+ enriched_substrates,
+ data.frame(
+ substrate = sub(" ///*", "...", short_row_names),
+ sequence = trunc_long_ppep(ppep),
+ anova_p_value = signif(fdr_adjusted_anova_p, 2),
+ min_group_obs_count = signif(min_group_obs_count, 0),
+ quality = signif(quality, 3)
+ )
+ )
+ excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows
+ if (excess_substrates)
+ substrates_df <- substrates_df[1:g_intensity_hm_rows, ]
+ names(substrates_df) <- headers_2nd_line <-
+ c("Substrate", "Sequence", "p-value", "per group)", "quality")
+ headers_1st_line <- c("", "", "ANOVA", "min(values", "")
+ if (1 < nrow(enriched_substrates))
+ cat("\n\\newpage\n")
+ cat(subsubsection_header(
+ sprintf(
+ "Details for %s%s-substrates",
+ if (excess_substrates)
+ sprintf(
+ "%s \"highest quality\" ",
+ g_intensity_hm_rows
+ )
+ else "",
+ kinase_name
+ )
+ ))
+ substrates_df <- substrates_df[order(-substrates_df$quality), ]
+ data_frame_tabbing_latex(
+ x = substrates_df,
+ tabstops = c(0.8, 3.8, 0.6, 0.8),
+ headings = c(headers_1st_line, headers_2nd_line)
+ )
+ } else {
+ if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n")
}
+ if (print_nb_messages) nb("end kinase ", kinase_name, "\n")
}
-# Write output tabular files
-
-# get kinase, ppep, concat(kinase) tuples for enriched kinases
-
-kinase_ppep_label <- sqldf("
- WITH
- t(ppep, label) AS
- (
- SELECT DISTINCT
- SUB_MOD_RSD AS ppep,
- group_concat(gene, '; ') AS label
+ # Write output tabular files
+
+ # get kinase, ppep, concat(kinase) tuples for enriched kinases
+
+ if (print_nb_messages) nb("kinase_ppep_label <- ...\n")
+ if (print_nb_messages) nbe("kinase_ppep_label <- ...\n")
+ kinase_ppep_label <- sqldf("
+ WITH
+ t(ppep, label) AS
+ (
+ SELECT DISTINCT
+ SUB_MOD_RSD AS ppep,
+ group_concat(gene, '; ') AS label
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ GROUP BY ppep
+ ),
+ k(kinase, ppep_join) AS
+ (
+ SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
FROM pseudo_ksdata
WHERE GENE IN (SELECT kinase FROM enriched_kinases)
- GROUP BY ppep
- ),
- k(kinase, ppep_join) AS
- (
- SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
- FROM pseudo_ksdata
- WHERE GENE IN (SELECT kinase FROM enriched_kinases)
- )
- SELECT k.kinase, t.ppep, t.label
- FROM t, k
- WHERE t.ppep = k.ppep_join
- ORDER BY k.kinase, t.ppep
- ")
-
-# extract what we need from full_data
-impish <- cbind(rownames(quant_data_imp), quant_data_imp)
-colnames(impish)[1] <- "Phosphopeptide"
-data_table_imputed_sql <- "
- SELECT
- f.*,
- k.label AS KSEA_enrichments,
- q.*
- FROM
- metadata_plus_p f
- LEFT JOIN kinase_ppep_label k
- ON f.Phosphopeptide = k.ppep,
- impish q
- WHERE
- f.Phosphopeptide = q.Phosphopeptide
- "
-data_table_imputed <- sqldf(data_table_imputed_sql)
-# Zap the duplicated 'Phosphopeptide' column named 'ppep'
-data_table_imputed <-
- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
-
-# Output with imputed, un-normalized data
-
-write.table(
- data_table_imputed
- , file = imputed_data_filename
- , sep = "\t"
- , col.names = TRUE
- , row.names = FALSE
- , quote = FALSE
+ )
+ SELECT k.kinase, t.ppep, t.label
+ FROM t, k
+ WHERE t.ppep = k.ppep_join
+ ORDER BY k.kinase, t.ppep
+ ")
+
+
+ # extract what we need from full_data
+ impish <- cbind(rownames(quant_data_imp), quant_data_imp)
+ colnames(impish)[1] <- "Phosphopeptide"
+ data_table_imputed_sql <- "
+ SELECT
+ f.*,
+ k.label AS KSEA_enrichments,
+ q.*
+ FROM
+ metadata_plus_p f
+ LEFT JOIN kinase_ppep_label k
+ ON f.Phosphopeptide = k.ppep,
+ impish q
+ WHERE
+ f.Phosphopeptide = q.Phosphopeptide
+ "
+ data_table_imputed <- sqldf(data_table_imputed_sql)
+ # Zap the duplicated 'Phosphopeptide' column named 'ppep'
+ data_table_imputed <-
+ data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+
+ # Output imputed, un-normalized data
+ if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n")
+ if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n")
+ write.table(
+ data_table_imputed
+ , file = imputed_data_filename
+ , sep = "\t"
+ , col.names = TRUE
+ , row.names = FALSE
+ , quote = FALSE
+ )
+
+
+ #output quantile normalized data
+ impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
+ colnames(impish)[1] <- "Phosphopeptide"
+ data_table_imputed <- sqldf(data_table_imputed_sql)
+ # Zap the duplicated 'Phosphopeptide' column named 'ppep'
+ data_table_imputed <-
+ data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+ if (print_nb_messages) nb("Output quantile normalized data tabular file\n")
+ if (print_nb_messages) nbe("Output quantile normalized data tabular file\n")
+ write.table(
+ data_table_imputed,
+ file = imp_qn_lt_data_filenm,
+ sep = "\t",
+ col.names = TRUE,
+ row.names = FALSE,
+ quote = FALSE
)
-
-#output quantile normalized data
-impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
-colnames(impish)[1] <- "Phosphopeptide"
-data_table_imputed <- sqldf(data_table_imputed_sql)
-# Zap the duplicated 'Phosphopeptide' column named 'ppep'
-data_table_imputed <-
- data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
-write.table(
- data_table_imputed,
- file = imp_qn_lt_data_filenm,
- sep = "\t",
- col.names = TRUE,
- row.names = FALSE,
- quote = FALSE
-)
-
-ppep_kinase <- sqldf("
- SELECT DISTINCT k.ppep, k.kinase
- FROM (
- SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
- FROM pseudo_ksdata
- WHERE GENE IN (SELECT kinase FROM enriched_kinases)
- ) k
- ORDER BY k.ppep, k.kinase
- ")
-
-RSQLite::dbWriteTable(
- conn = db,
- name = "ksea_enriched_ks",
- value = ppep_kinase,
- append = FALSE
- )
+ ppep_kinase <- sqldf("
+ SELECT DISTINCT k.ppep, k.kinase
+ FROM (
+ SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ ) k
+ ORDER BY k.ppep, k.kinase
+ ")
+
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "ksea_enriched_ks",
+ value = ppep_kinase,
+ append = FALSE
+ )
+}
+
+if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n")
RSQLite::dbWriteTable(
conn = db,
@@ -3453,6 +6293,8 @@
"
)
+if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n")
+if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n")
write.table(
dbReadTable(db, "stats_metadata_v"),
file = anova_ksea_mtdt_file,
@@ -3462,75 +6304,21 @@
quote = FALSE
)
+cat("\n\\clearpage\n")
```
+# Data-processing summary flowchart
+
+![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)
+
```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
cat("\\leavevmode\n\n\n")
-# write parameters to report
-
-param_unlist <- unlist(as.list(params))
-param_df <- data.frame(
- parameter = paste0("\\verb@", names(param_unlist), "@"),
- value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
- )
-
-data_frame_latex(
- x = param_df,
- justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
- centered = TRUE,
- caption = "Input parameters",
- anchor = const_table_anchor_bp,
- underscore_whack = FALSE
- )
-
-# write parameters to SQLite output
-
-mqppep_anova_script_param_df <- data.frame(
- script = "mqppep_anova_script.Rmd",
- parameter = names(param_unlist),
- value = param_unlist
- )
-ddl_exec(db, "
- DROP TABLE IF EXISTS script_parameter;
- "
-)
-ddl_exec(db, "
- CREATE TABLE IF NOT EXISTS script_parameter(
- script TEXT,
- parameter TEXT,
- value ANY,
- UNIQUE (script, parameter) ON CONFLICT REPLACE
- )
- ;
- "
-)
-RSQLite::dbWriteTable(
- conn = db,
- name = "script_parameter",
- value = mqppep_anova_script_param_df,
- append = TRUE
-)
-
+write_params(db)
# We are done with output
RSQLite::dbDisconnect(db)
+
+cat("\\clearpage\n\\section{R package versions}\n")
+utils::toLatex(utils::sessionInfo())
```
-
diff -r dbff53e6f75f -r 08678c931f5d mqppep_mrgfltr.py
--- a/mqppep_mrgfltr.py Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_mrgfltr.py Fri Oct 28 18:27:21 2022 +0000
@@ -87,7 +87,10 @@
nargs=1,
required=True,
dest="phosphopeptides",
- help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
+ help=" ".join([
+ "Phosphopeptide data for experimental results, including the",
+ "intensities and the mapping to kinase domains, in tabular format"
+ ]),
)
# UniProtKB/SwissProt DB input, SQLite
parser.add_argument(
@@ -106,7 +109,10 @@
required=False,
default=[],
dest="species",
- help="limit PhosphoSitePlus records to indicated species (field may be empty)",
+ help=" ".join([
+ "limit PhosphoSitePlus records to indicated species",
+ "(field may be empty)"
+ ]),
)
# outputs:
@@ -174,7 +180,7 @@
# determine species to limit records from PSP_Regulatory_Sites
if options.species is None:
exit(
- 'Argument "species" is required (and may be empty) but not supplied'
+ 'Argument "species" is required (& may be empty) but not supplied'
)
try:
if len(options.species) > 0:
@@ -216,20 +222,25 @@
FUNCTION_PHOSPHORESIDUE = (
"Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
)
- GENE_NAME = "Gene_Name" # Gene Name from UniProtKB
- ON_FUNCTION = (
- "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites
- )
- ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites
- ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
- ON_PROCESS = (
- "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites
- )
- ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+ # Gene Name from UniProtKB
+ GENE_NAME = "Gene_Name"
+ # ON_FUNCTION column from PSP_Regulatory_Sites
+ ON_FUNCTION = ("ON_FUNCTION")
+ # NOTES column from PSP_Regulatory_Sites
+ ON_NOTES = "NOTES"
+ # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+ ON_OTHER_INTERACT = "ON_OTHER_INTERACT"
+ # ON_PROCESS column from PSP_Regulatory_Sites
+ ON_PROCESS = ("ON_PROCESS")
+ # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+ ON_PROT_INTERACT = "ON_PROT_INTERACT"
PHOSPHOPEPTIDE = "Phosphopeptide"
PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
PHOSPHORESIDUE = "Phosphoresidue"
- PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
+ PUTATIVE_UPSTREAM_DOMAINS = " ".join([
+ "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",
+ "Phosphatases/Binding Domains"
+ ])
SEQUENCE = "Sequence"
SEQUENCE10 = "Sequence10"
SEQUENCE7 = "Sequence7"
@@ -328,8 +339,26 @@
CitationData
) VALUES (?,?)
"""
- CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
- CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+ CITATION_INSERT_PSP = " ".join([
+ "PhosphoSitePlus(R) (PSP) was created by Cell Signaling",
+ "Technology Inc. It is licensed under a Creative Commons",
+ "Attribution-NonCommercial-ShareAlike 3.0 Unported License.",
+ "When using PSP data or analyses in printed publications or",
+ "in online resources, the following acknowledgements must be",
+ "included: (a) the words",
+ '"PhosphoSitePlus(R), www.phosphosite.org" must',
+ "be included at appropriate places in the text or webpage,",
+ "and (b) the following citation must be included in the",
+ 'bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser',
+ "JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,",
+ "PTMs and recalibrations. Nucleic Acids Res. 2015",
+ '43:D512-20. PMID: 25514926."'
+ ])
+ CITATION_INSERT_PSP_REF = " ".join([
+ 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and',
+ 'recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298,',
+ "https://doi.org/10.1093/nar/gkr1122"
+ ])
MRGFLTR_METADATA_COLUMNS = [
"ppep_id",
@@ -388,7 +417,8 @@
file1.close()
file1_encoded.close()
- # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+ # Get the list of phosphopeptides with the p's that represent
+ # the phosphorylation sites removed
re_phos = re.compile("p")
end_time = time.process_time() # timer
@@ -397,7 +427,7 @@
file=sys.stderr,
)
- # ----------- Get SwissProt data from SQLite database (start) -----------
+ # -------- Get SwissProt data from SQLite database (start) -----------
# build UniProt sequence LUT and list of unique SwissProt sequences
# Open SwissProt SQLite database
@@ -465,7 +495,7 @@
Description = ""
Gene_Name = ""
- # ----------- Get SwissProt data from SQLite database (finish) -----------
+ # -------- Get SwissProt data from SQLite database (finish) -----------
end_time = time.process_time() # timer
print(
@@ -473,12 +503,13 @@
file=sys.stderr,
)
- # ----------- Get SwissProt data from SQLite database (start) -----------
+ # -------- Get SwissProt data from SQLite database (start) -----------
# Open SwissProt SQLite database
conn = sql.connect(uniprot_sqlite)
cur = conn.cursor()
- # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
+ # Set up dictionary to aggregate results for phosphopeptides
+ # corresponding to dephosphoeptide
DephosphoPep_UniProtSeq_LUT = {}
# Set up dictionary to accumulate results
@@ -548,7 +579,7 @@
# Get one tuple for each `phospho_pep`
# in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
for (upid, gn, desc) in r:
- # Append pseudo-tuple per UniProt_ID but only when it is not present
+ # Append pseudo-tuple per UniProt_ID iff not present
if (
upid
not in DephosphoPep_UniProtSeq_LUT[
@@ -571,7 +602,7 @@
phospho_pep = dephospho_pep = sequence = 0
upid = gn = desc = r = ""
- # ----------- Get SwissProt data from SQLite database (finish) -----------
+ # -------- Get SwissProt data from SQLite database (finish) -----------
end_time = time.process_time() # timer
print(
@@ -608,33 +639,6 @@
file=sys.stderr,
) # timer
- # ########################################################################
- # # trim upstream_data to include only the upstream map columns
- # old_cols = upstream_data.columns.tolist()
- # i = 0
- # first_intensity = -1
- # last_intensity = -1
- # intensity_re = re.compile("Intensity.*")
- # for col_name in old_cols:
- # m = intensity_re.match(col_name)
- # if m:
- # last_intensity = i
- # if first_intensity == -1:
- # first_intensity = i
- # i += 1
- # # print('last intensity = %d' % last_intensity)
- # col_PKCalpha = last_intensity + 2
- #
- # data_in_cols = [old_cols[0]] + old_cols[
- # first_intensity: last_intensity + 1
- # ]
- #
- # if upstream_data.empty:
- # print("upstream_data is empty")
- # exit(0)
- #
- # data_in = upstream_data.copy(deep=True)[data_in_cols]
- ########################################################################
# trim upstream_data to include only the upstream map columns
old_cols = upstream_data.columns.tolist()
i = 0
@@ -666,7 +670,7 @@
data_in.columns = data_col_names
print("data_in")
print(data_in)
- ########################################################################
+ ######################################################################
# Convert floating-point integers to int64 integers
# ref: https://stackoverflow.com/a/68497603/15509512
@@ -689,7 +693,7 @@
) # timer
# Produce a dictionary of metadata for a single phosphopeptide.
- # This is a replacement of `UniProtInfo_subdict` in the original code.
+ # This is replaces `UniProtInfo_subdict` from the original code.
def pseq_to_subdict(phospho_pep):
# Strip "p" from phosphopeptide sequence
dephospho_pep = re_phos.sub("", phospho_pep)
@@ -733,19 +737,28 @@
if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
raise PreconditionError(
phospho_pep,
- "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+ " ".join([
+ "no matching phosphopeptide found in",
+ "PhosphoPep_UniProtSeq_LUT"
+ ])
)
if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
raise PreconditionError(
dephospho_pep,
- "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
+ " ".join([
+ "dephosphorylated phosphopeptide not found",
+ "in DephosphoPep_UniProtSeq_LUT"
+ ])
)
if (
- dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+ dephospho_pep != PhosphoPep_UniProtSeq_LUT[
+ (phospho_pep, DEPHOSPHOPEP)]
):
my_err_msg = "dephosphorylated phosphopeptide does not match "
- my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
- my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+ my_err_msg += "PhosphoPep_UniProtSeq_LUT"
+ my_err_msg += "[(phospho_pep,DEPHOSPHOPEP)] = "
+ my_err_msg += PhosphoPep_UniProtSeq_LUT[
+ (phospho_pep, DEPHOSPHOPEP)]
raise PreconditionError(dephospho_pep, my_err_msg)
result[SEQUENCE] = [dephospho_pep]
@@ -761,20 +774,18 @@
if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
raise PreconditionError(
dephospho_pep,
- "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
+ "".join(
+ "no matching phosphopeptide found",
+ "in DephosphoPep_UniProtSeq_LUT")
)
UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
(dephospho_pep, SEQUENCE)
]
if len(UniProtSeqList) < 1:
print(
- "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
- % dephospho_pep
+ "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] %s"
+ % (dephospho_pep, "because value has zero length")
)
- # raise PreconditionError(
- # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
- # 'value has zero length'
- # )
for UniProtSeq in UniProtSeqList:
i = 0
phosphoresidues = []
@@ -854,13 +865,15 @@
# add Sequence10
if psite < 10: # phospho_pep at N terminus
seq10 = (
- str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11]
+ str(UniProtSeq)[:psite] + "p" + str(
+ UniProtSeq)[psite: psite + 11]
)
elif (
len(UniProtSeq) - psite < 11
): # phospho_pep at C terminus
seq10 = (
- str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:]
+ str(UniProtSeq)[psite - 10: psite] + "p" + str(
+ UniProtSeq)[psite:]
)
else:
seq10 = str(UniProtSeq)[psite - 10: psite + 11]
@@ -922,23 +935,6 @@
newstring = "; ".join(
[", ".join(prez) for prez in result[PHOSPHORESIDUE]]
)
- # #separate the isoforms in PHOSPHORESIDUE column with ";"
- # oldstring = result[PHOSPHORESIDUE]
- # oldlist = list(oldstring)
- # newstring = ""
- # i = 0
- # for e in oldlist:
- # if e == ";":
- # if numps > 1:
- # if i%numps:
- # newstring = newstring + ";"
- # else:
- # newstring = newstring + ","
- # else:
- # newstring = newstring + ";"
- # i +=1
- # else:
- # newstring = newstring + e
result[PHOSPHORESIDUE] = newstring
# separate sequence7's by |
@@ -973,7 +969,8 @@
) # timer
# Construct dictionary from list of lists
- # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+ # ref: https://www.8bitavenue.com/\
+ # how-to-convert-list-of-lists-to-dictionary-in-python/
UniProt_Info = {
result[0]: result[1]
for result in result_list
@@ -982,8 +979,8 @@
end_time = time.process_time() # timer
print(
- "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
- % (end_time - start_time,),
+ "%0.6f create dictionary mapping phosphopeptide %s"
+ % (end_time - start_time, "to metadata dictionary [C]"),
file=sys.stderr,
) # timer
@@ -1042,14 +1039,13 @@
).reset_index()
seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
- # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
- # read in PhosphoSitePlus Regulatory Sites dataset
- # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+ # read in PhosphoSitePlus Regulatory Sites dataset from SQLite
+ # --- -------------- begin read PSP_Regulatory_sites -----
conn = sql.connect(uniprot_sqlite)
regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
# Close SwissProt SQLite database
conn.close()
- # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+ # ... -------------- end read PSP_Regulatory_sites -------
# keep only the human entries in dataframe
if len(species) > 0:
@@ -1127,7 +1123,8 @@
fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
fp_series[i] = fp_series[i].replace("; ; ", "; ")
- # turn blanks into N_A to signify the info was searched for but cannot be found
+ # turn blanks into N_A to signify the info
+ # that was searched for but cannot be found
if fp_series[i] == "":
fp_series[i] = N_A
@@ -1203,7 +1200,6 @@
inplace=True,
)
- # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
res2 = sorted(
data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
)
@@ -1229,21 +1225,6 @@
file=sys.stderr,
) # timer
- # #rename upstream columns in new list
- # new_cols = []
- # for name in cols:
- # if "_NetworKIN" in name:
- # name = name.split("_")[0]
- # if " motif" in name:
- # name = name.split(" motif")[0]
- # if " sequence " in name:
- # name = name.split(" sequence")[0]
- # if "_Phosida" in name:
- # name = name.split("_")[0]
- # if "_PhosphoSite" in name:
- # name = name.split("_")[0]
- # new_cols.append(name)
-
# rename upstream columns in new list
def col_rename(name):
if "_NetworKIN" in name:
@@ -1276,8 +1257,8 @@
upstream_data_cast.columns = new_cols_cast
upstream_data_cast["p_peptide"] = upstream_data.index
- # --- -------------- begin read upstream_data_melt ------------------------------------
- # ----------- Get melted kinase mapping data from SQLite database (start) -----------
+ # Get melted kinase mapping data from SQLite database
+ # --- begin read upstream_data_melt ---------------------------------
conn = sql.connect(uniprot_sqlite)
upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
# Close SwissProt SQLite database
@@ -1303,8 +1284,7 @@
% (end_time - start_time, len(upstream_data_melt.axes[0])),
file=sys.stderr,
)
- # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
- # ... -------------- end read upstream_data_melt --------------------------------------
+ # ... end read upstream_data_melt ---------------------------------
end_time = time.process_time() # timer
print(
@@ -1332,10 +1312,13 @@
if p_peptide in melt_dict:
melt_dict[p_peptide].append(characterization)
else:
- exit(
- 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
- % (p_peptide)
- )
+ los = [
+ "Phosphopeptide %s" % p_peptide,
+ "not found in ppep_mapping_db:",
+ '"phopsphopeptides" and "ppep_mapping_db" must both',
+ "originate from the same run of mqppep_kinase_mapping"
+ ]
+ exit(" ".join(los))
end_time = time.process_time() # timer
print(
@@ -1397,29 +1380,12 @@
]
]
- # cols_output_prelim = output_df.columns.tolist()
- #
- # print("cols_output_prelim")
- # print(cols_output_prelim)
- #
- # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
- #
- # print("cols_output with p-peptide")
- # print(cols_output)
- #
- # cols_output = [col for col in cols_output if not col == "p-peptide"]
- #
- # print("cols_output")
- # print(cols_output)
- #
- # output_df = output_df[cols_output]
-
# join output_df back to quantitative columns in data_in df
quant_cols = data_in.columns.tolist()
quant_cols = quant_cols[1:]
quant_data = data_in[quant_cols]
- # ----------- Write merge/filter metadata to SQLite database (start) -----------
+ # ---- Write merge/filter metadata to SQLite database (start) ----
# Open SwissProt SQLite database
conn = sql.connect(output_sqlite)
cur = conn.cursor()
@@ -1467,7 +1433,7 @@
# Close SwissProt SQLite database
conn.close()
- # ----------- Write merge/filter metadata to SQLite database (finish) -----------
+ # ---- Write merge/filter metadata to SQLite database (finish) ----
output_df = output_df.merge(
quant_data,
@@ -1480,15 +1446,18 @@
output_df = output_df[output_cols]
# cosmetic changes to Upstream column
+ # fill the NaN with "" for those Phosphopeptides that got a
+ # "WARNING: Failed match for " in the upstream mapping
output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
PUTATIVE_UPSTREAM_DOMAINS
].fillna(
""
- ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+ )
us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
i = 0
while i < len(us_series):
- # turn blanks into N_A to signify the info was searched for but cannot be found
+ # turn blanks into N_A to signify the info
+ # that was searched for but cannot be found
if us_series[i] == "":
us_series[i] = N_A
i += 1
@@ -1530,8 +1499,9 @@
# Rev. 7/1/2016
# Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
# Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
- # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
- # read from SwissProt SQLite database
+ # Rev. 12/2/2021: Converted to Python from ipynb; use fast \
+ # Aho-Corasick searching; \
+ # read from SwissProt SQLite database
# Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
#
diff -r dbff53e6f75f -r 08678c931f5d perpage.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/perpage.tex Fri Oct 28 18:27:21 2022 +0000
@@ -0,0 +1,547 @@
+% \iffalse
+%%
+%% perpage is part of the bigfoot bundle for critical typesetting
+%% Copyright 2002--2014 David Kastrup
+%%
+%% The license notice and corresponding source code for this file are
+%% contained in perpage.dtx.
+%%
+% This program is free software; you can redistribute it and/or modify
+% it under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 2 of the License, or
+% (at your option) any later version.
+%
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program; if not, write to the Free Software
+% Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+% \fi
+% \CheckSum{396}
+% \GetFileInfo{perpage.sty}
+% \date{\filedate}
+% \author{David Kastrup\thanks{\texttt{dak@gnu.org}}}
+% \title{The \texttt{perpage} package\\Version \fileversion}
+% \maketitle
+% \section{Description}
+%
+% The \texttt{perpage} package adds the ability to reset counters per
+% page and/or keep their occurences sorted in order of appearance on
+% the page.
+%
+% It works by attaching itself to the code for \cmd{\stepcounter} and
+% will then modify the given counter according to information written
+% to the |.aux| file, which means that multiple passes may be needed.
+% Since it uses the internals of the \cmd{\label} mechanism, the need
+% for additional passes will get announced by \LaTeX\ as ``labels may
+% have changed''.
+%
+% \DescribeMacro{\MakePerPage}
+% \begin{quote}
+% |\MakePerPage[2]{footnote}|
+% \end{quote}
+% will start footnote numbers with~2 on each page (the optional
+% argument defaults to~1). 2~might be a strange number, unless you
+% have used something like
+% \begin{quote}
+% |\renewcommand\thefootnote{\fnsymbol{footnote}}|
+% \end{quote}
+% and want to start off with a dagger. The starting value must not be
+% less than~1 so that the counter logic can detect the reset of a
+% counter
+% reliably.\footnote{This unfortunately means that you can't just use
+% \cmd{\alph} in order to get figures on page~10 numbered as ``10'',
+% ``10a'', ``10b''.}
+% It could be a good idea to redefine |\@cnterr| if you use a format
+% with limited range: at the first pass, footnotes are not reset
+% across pages and things like |\fnsymbol| will quickly run out of
+% characters to use.
+%
+% \DescribeMacro{\theperpage}
+% If you want to label things also on a per page base, for example
+% with
+% \begin{quote}
+% |\renewcommand{\thefigure}{\thepage-\arabic{figure}}|
+% \end{quote}
+% you'll have the problem that \cmd{\thepage} is updated
+% asynchronously with the real page, since \TeX\ does not know which
+% page the figure will end up. If you have used the |perpage| package
+% for modifying the figure counter, however, at the point where the
+% counter is incremented, the macro \cmd{\theperpage} will be set to
+% the correct value corresponding to the actual page location. Note
+% that this macro is shared between all counters, so advancing a
+% different counter under control of |perpage| will render
+% \cmd{\thefigure} incorrect.
+%
+% \DescribeMacro{\MakeSorted}
+% \begin{quote}
+% |\MakeSorted{figure}|
+% \end{quote}
+% will make the |figure| counter get `sorted': this means that counter
+% values will be assigned in order of appearance in the output, not in
+% order of appearance in the source code. For example, the order of
+% interspersed one- and two-column figures might get mixed up by
+% \LaTeX\ in the output. Making the counter sorted will fix the order
+% to match the order of appearance. A similar problem is when
+% ordinary footnotes are present in floating material (this does not
+% work in standard \LaTeX, but might do so when using |manyfoot.sty|
+% or |bigfoot.sty|): this might jumble their order in the output, and
+% making their counter sorted will make things appear fine again.
+%
+% While this would not fix the order in the table of figures,
+% fortunately the respective entries actually get written out in order
+% of appearance in the output anyway, so this indeed fixes the
+% problem.
+%
+% Manually setting the counter does not lead to reliable results in
+% general; as a special case, however, resetting it to zero is
+% recognized (this can also happen automatically when the counter is
+% dependent on some other counter). The point where it is reset in
+% the source code separates `count groups': everything in the source
+% before that point is assigned sorted numbers separately from
+% everything appearing behind it, and the sequence numbers start again
+% with~1 with the first item appearing in the output (not the source)
+% from the new count group.
+%
+% \DescribeMacro{\MakeSortedPerPage}
+% \begin{quote}
+% |\MakeSortedPerPage[2]{table}|
+% \end{quote}
+% will make the table numbers restart at 2 on each page \emph{and}
+% will keep them sorted, to boot. Introducing new count groups by
+% resetting the counter to~0 manually will not work, as it is not
+% clear how to handle count groups scattered between pages. You will
+% usually want to use something like
+% \begin{quote}
+% |\renewcommand{\thefigure}{\theperpage-\arabic{figure}}|
+% \end{quote}
+% to go along with a page-wise figure
+% number.\footnote{Note the use of \cmd{\theperpage} here, see above.}
+% Note that it would be quite silly to start the ranges with~2: this
+% is just an example for the optional argument in case that you ever
+% need it.
+%
+% \DescribeMacro{\AddAbsoluteCounter}
+% \begin{quote}
+% |\AddAbsoluteCounter{equation}|
+% \end{quote}
+% will create a counter |absequation| that will advance together with
+% the counter |equation| but will not get reset along with it. This
+% is not sorted into output order, but just runs along with the
+% sequence in the source file. As a special case, the counter
+% |abspage| is created in this manner and \cmd{\theabspage} is defined
+% as an arabic number that works in the same contexts as \cmd{\page}
+% (namely, gets properly deferred by \cmd{\protected@write}).
+%
+% \StopEventually{}
+% \section{The documentation driver}
+% This is the default driver for typesetting the docs. Running it
+% through as a separate file will include the code section. Running
+% the original |.dtx| file through will omit the code.
+% \begin{macrocode}
+%<*driver>
+%ACE \documentclass{ltxdoc}
+%ACE \usepackage{perpage}
+%ACE \MakePerPage{footnote}
+%ACE \begin{document}
+%ACE \OnlyDescription
+%ACE % \AlsoImplementation
+%ACE \DocInput{perpage.dtx}
+%ACE \end{document}
+%ACE %
+%ACE % \end{macrocode}
+%ACE %
+%ACE % \section{The package interfaces}
+%ACE % First identification.
+%ACE % \begin{macrocode}
+%ACE %<*style>
+%ACE \NeedsTeXFormat{LaTeX2e}
+%ACE \ProvidesPackage{perpage}[2014/10/25 2.0 Reset/sort counters per page]
+%ACE % \end{macrocode}
+%ACE % \begin{macro}{\pp@cl@begin}
+%ACE % \begin{macro}{\pp@cl@end}
+%ACE % These macros are considerable tricky. They are called as
+%ACE % artificial `dependent' counters when the counter they are hooked
+%ACE % into is advanced. The way in which those counters are called are
+%ACE % one of the following:
+%ACE % \begin{quote}
+%ACE % \begin{verbatim}
+%ACE % \def\@stpelt#1{\global\csname c@#1\endcsname \z@}
+%ACE % \end{verbatim}
+%ACE % \end{quote}
+%ACE % which is the default way of resetting a subordinate counter used
+%ACE % in \LaTeX, or
+%ACE % \begin{quote}
+%ACE % \begin{verbatim}
+%ACE % \def\@stpelt#1{\global\csname c@#1\endcsname \m@ne\stepcounter{#1}}
+%ACE % \end{verbatim}
+%ACE % \end{quote}
+%ACE % which is a little present from |fixltx2e.sty| as of 2014/05/01,
+%ACE % quite complicating this feat.
+%ACE %
+%ACE % The startup code swallows either |\global \advance| or |\global|.
+% \begin{macrocode}
+\def\pp@cl@begin{\z@\z@ \begingroup}
+% \end{macrocode}
+% The command used for ending our fake counters checks for the
+% |\m@ne| condition. We don't want to bump our auxiliary counters
+% twice, so we remove the following |\stepcounter| command. Things
+% will go haywire if there is none, of course.
+% \begin{macrocode}
+\def\pp@cl@end{\afterassignment\pp@cl@end@ii \count@}
+\def\pp@cl@end@ii{%
+ \relax
+ \expandafter\endgroup
+ \ifnum\count@<\z@
+ \expandafter\pp@cl@end@iii
+ \fi}
+\def\pp@cl@end@iii\stepcounter#1{}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+%
+% \begin{macro}{\AddAbsoluteCounter}
+% adds a counter with prefix |abs| to a given counter. It typesets
+% as an arabic number and never gets reset. And it is advanced
+% whenever the unprefixed counter gets advanced.
+% \begin{macrocode}
+\newcommand\AddAbsoluteCounter[1]
+{\@ifundefined{c@abs#1}{%
+ \expandafter\newcount\csname c@abs#1\endcsname
+ \global\value{abs#1}\@ne
+ \global\expandafter\let\csname cl@abs#1\endcsname\@empty
+ \expandafter\xdef\csname theabs#1\endcsname{%
+ \noexpand\number \csname c@abs#1\endcsname}%
+ \global\@namedef{c@pabs@#1}{\pp@cl@begin
+ \stepcounter{abs#1}%
+ \pp@cl@end}%
+ \@addtoreset{pabs@#1}{#1}}{}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\c@perpage}
+% We now create the absolute counter |perpage|:
+% \begin{macrocode}
+%ACE \AddAbsoluteCounter{page}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\theabspage}
+% This has to be specially defined so that it will expand as late as
+% \cmd{\thepage} does. Several commands set the latter temporarily
+% to \cmd{\relax} in order to inhibit expansion, and we will more or
+% less imitate its behavior when found set in that manner.
+% \begin{macrocode}
+\def\theabspage{\ifx\thepage\relax
+ \noexpand\theabspage
+ \else
+ \number\c@abspage
+ \fi}
+% \end{macrocode}
+% \end{macro}
+% Here follow the three commands for defining counters per page:
+% \begin{macro}{\MakePerPage}
+% This creates a counter reset per page. An optional second
+% argument specifies the starting point of the sequence.
+% \begin{macrocode}
+\newcommand*\MakePerPage[2][\@ne]{%
+ \pp@makeperpage{#2}\c@pchk@{#1}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\MakeSorted}
+% This will create a counter sorted in appearance on the page. No
+% optional argument is given: set the counter to a desired starting
+% value manually if you need to. Resetting it to zero will start a
+% new count group, setting it to other values is probably not reliable.
+% \begin{macrocode}
+\newcommand*\MakeSorted[1]{%
+ \setcounter{#1}{\z@}%
+ \pp@makeperpage{#1}\c@schk@{\@ne}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\MakeSortedPerPage}
+% This will create output in sorted order, reset on each page. Use
+% an optional argument to specify the starting value per page. This
+% must not be~0, unfortunately.
+% \begin{macrocode}
+\newcommand*\MakeSortedPerPage[2][\@ne]{%
+ \pp@makeperpage{#2}\c@spchk@{#1}}
+% \end{macrocode}
+% \end{macro}
+% All of those must only occur in the preamble since we can't do the
+% initialization of the counter values otherwise.
+% \begin{macrocode}
+\@onlypreamble\MakePerPage
+\@onlypreamble\MakeSorted
+\@onlypreamble\MakeSortedPerPage
+% \end{macrocode}
+% \section{Internals}
+%
+% It works in the following manner: The basic work is done through
+% attaching help code to the counter's reset list. Each counter has
+% an associated absolute id that is counted through continuously and
+% is never reset, thus providing a unique frame of reference. Sorted
+% and perpage counters work by writing out information to the
+% |.aux| file.
+%
+% The information we maintain for each counter while processing the
+% source file are:
+% \begin{itemize}
+% \item The absolute counter id.
+% \item The last counter value so that we can check whether the
+% sequence has been interrupted.
+% \item The current scope id.
+% \item Its starting value.
+% \end{itemize}
+%
+% The information written to the file consists of:
+% \begin{itemize}
+% \item The absolute counter id.
+% \item The current scope id.
+% \item The scope's starting value.
+% \item The absolute counter id of a superior counter.
+% \end{itemize}
+%
+% Sorted counters work by writing out the current absolute id and
+% range id into the |.aux| file each time the counter gets incremented.
+% Whenever the counter is changed in a manner different from being
+% incremented, a new counter scope gets started. Each counter scope
+% has its own independently assigned counter numbers and is associated
+% with its absolute id starting value. So as each counter is
+% incremented, we write out the triple of current absolute id, counter
+% scope and initial value for the scope. Scope changes when a value
+% assigned from the file differs from the `natural' value. When the
+% file is read in, counter movements are tracked. Each counter that
+% does not have its `natural' value, is having a counter setting
+% recorded.
+%
+% The stuff works by adding a pseudo-reset counter to the counter's
+% dependent counter list.
+%
+% \begin{macro}{\pp@makeperpage}
+% This does the relevant things for modifying a counter. It defines
+% its reset value, it defines the correspoding absolute counter.
+% The absolute counter serves a double function: it is also used for
+% assigning numbers while reading the |.aux| file. For this purpose
+% it is assigned the initialized values here and in the enddocument
+% hook (which is called before rereading the |.aux| file and
+% checking for changed labels), while the counter is reset to zero
+% at the start of the document.
+% \begin{macrocode}
+\def\pp@makeperpage#1#2#3{%
+ \global\expandafter\mathchardef\csname c@pp@r@#1\endcsname=#3\relax
+ \global\@namedef{c@pchk@#1}{#2{#1}}%
+ \newcounter{pp@a@#1}%
+ \setcounter{pp@a@#1}{#3}%
+ \addtocounter{pp@a@#1}\m@ne
+ \@addtoreset{pchk@#1}{#1}%
+ \AtBeginDocument{\setcounter{pp@a@#1}\z@}%
+ \edef\next{\noexpand\AtEndDocument
+ {\noexpand\setcounter{pp@a@#1}{%
+ \number\value{pp@a@#1}}}}\next}
+\@onlypreamble\pp@makeperpage
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@chkvlist}
+% Check for an empty vertical list. If we have one, that is worth
+% warning about.
+% \begin{macrocode}
+\def\pp@chkvlist{%
+ \ifcase
+ \ifvmode
+ \ifx\lastnodetype\@undefined
+ \ifdim-\@m\p@=\prevdepth\ifdim\lastskip=\z@\ifnum\lastpenalty=\z@
+ \@ne
+ \fi\fi\fi
+ \else
+ \ifnum\lastnodetype=\m@ne \@ne \fi
+ \fi
+ \fi \z@
+ \or
+ \PackageWarning{perpage}{\string\stepcounter\space probably at start of
+ vertical list:^^JYou might need to use \string\leavevmode\space
+ before it to avoid vertical shifts}%
+ \fi}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@fetchctr}
+% \begin{macro}{\theperpage}
+% This fetches the counter information and puts it into
+% \cmd{\pp@label}, \cmd{\pp@page} and (globally) into
+% \cmd{\theperpage}.
+% \begin{macrocode}
+\def\pp@fetchctr#1{\expandafter\expandafter\expandafter\pp@fetchctrii
+ \csname pp@r@#1@\number\value{pp@a@#1}\endcsname
+ {}{}}
+
+\global\let\theperpage\@empty
+
+\def\pp@fetchctrii#1#2#3{\def\pp@label{#1}%
+ \def\pp@page{#2}%
+ \gdef\theperpage{#3}}
+% \end{macrocode}
+% \end{macro}
+% \end{macro}
+% Ok, let's put together all the stuff for the simplest case, counters
+% numbered per page without sorting:
+% \begin{macro}{\c@pchk@}
+% This is the code buried into to the reset list. When the reset
+% list is executed in the context of advancing a counter, we call
+% something like
+% \begin{verbatim}
+%\global\c@pchk@{countername}\z@
+% \end{verbatim}
+% since the reset list expected a counter here instead of some
+% generic command. That is the reason we start off this command by
+% giving \cmd{\global} something to chew on.
+% \begin{macrocode}
+\def\c@pchk@#1{\pp@cl@begin
+% \end{macrocode}
+% Now we fetch the page value corresponding to the not yet adjusted
+% value of the absolute counter to see whether the previous counter
+% advance happened on the same page.
+% \begin{macrocode}
+ \pp@fetchctr{#1}\let\next\pp@page
+ \addtocounter{pp@a@#1}\@ne
+ \pp@fetchctr{#1}%
+% \end{macrocode}
+% We compare the pages for current and last advance of the counter.
+% If they differ, we reset the counter to its starting value. We do
+% the same if the counter has been reset to zero manually, likely by
+% being in the reset list of some other counter.
+% \begin{macrocode}
+ \ifcase\ifx\next\pp@page\else\@ne\fi
+ \ifnum\value{#1}=\z@\@ne\fi\z@
+ \else
+ \setcounter{#1}{\value{pp@r@#1}}%
+ \fi
+ \pp@writectr\pp@pagectr{#1}{\noexpand\theabspage}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@writectr}
+% This is the common ending of all pseudo reset counters. It writes
+% out an appropriate command to the |.aux| file with all required
+% information. We try to replicate any sentinel kerns or penalties.
+% \begin{macrocode}
+\def\pp@writectr#1#2#3{\edef\next{%
+ \string#1{#2}{\number\value{pp@a@#2}}{#3}{\noexpand\thepage}}%
+ \pp@chkvlist
+ \dimen@=\lastkern
+ \ifdim\dimen@=\z@ \else \unkern\fi
+ \count@=\lastpenalty
+ \protected@write\@auxout{}{\next}%
+ \ifdim\dimen@=\z@
+ \penalty \ifnum\count@<\@M \@M \else \count@ \fi
+ \else \kern\dimen@\fi
+ \pp@cl@end}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@labeldef}
+% This is a helper macro.
+% \begin{macrocode}
+\def\pp@labeldef#1#2#3#4#5{\@newl@bel{pp@r@#2}{#3}{{#1}{#4}{#5}}}
+% \end{macrocode}
+% \end{macro}
+%
+% \begin{macro}{\pp@pagectr}
+% This is the workhorse for normal per page counters. It is called
+% whenever the |.aux| file is read in and establishes the
+% appropriate information for each counter advancement in a
+% pseudolabel.
+% \begin{macrocode}
+\def\pp@pagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{%
+ \addtocounter{pp@a@#1}\@ne
+ \expandafter\pp@labeldef\expandafter
+ {\number\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\c@schk@}
+% This is called for implementing sorted counters. Sorted counters
+% maintain a ``count group'', and the values in each count group are
+% numbered independently from that of other count groups. Whenever
+% a counter is found to have been reset, it will start a new count
+% group. At the end of document, the count group counters need to
+% get reset, too, so that the check for changed |.aux| files will
+% still work.
+% \begin{macrocode}
+\def\c@schk@#1{\pp@cl@begin
+ \addtocounter{pp@a@#1}\@ne
+ \ifnum\value{#1}=\@ne
+ \expandafter\xdef\csname pp@g@#1\endcsname{\number\value{pp@a@#1}}%
+ \edef\next{\noexpand\AtEndDocument{\global\let
+ \expandafter\noexpand\csname pp@g@#1@\number\value{pp@a@#1}\endcsname
+ \relax}}\next
+ \fi
+ \pp@fetchctr{#1}%
+ \ifx\pp@page\@empty
+ \else \setcounter{#1}{\pp@label}\fi
+ \pp@writectr\pp@spagectr{#1}{\csname pp@g@#1\endcsname}}%
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@spagectr}
+% This is the code advancing the respective value of the appropriate
+% count group and assigning the label.
+% \begin{macrocode}
+\def\pp@spagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{%
+ \count@0\csname pp@g@#1@#3\endcsname
+ \advance\count@\@ne
+ \expandafter\xdef\csname pp@g@#1@#3\endcsname{\number\count@}%
+ \expandafter\pp@labeldef\expandafter
+ {\number\count@}{#1}{#2}{#3}{#4}}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\c@spchk@}
+% And this finally is the counter advance code for sorted counters
+% per page. Basically, we just use one count group per page.
+% Resetting a counter manually will not introduce a new count group,
+% and it would be hard to decide what to do in case count groups and
+% page positions overlap.
+% \begin{macrocode}
+\def\c@spchk@#1{\pp@cl@begin
+ \addtocounter{pp@a@#1}\@ne
+ \pp@fetchctr{#1}%
+ \ifx\pp@page\@empty
+ \else \setcounter{#1}{\pp@label}\fi
+ \pp@writectr\pp@ppagectr{#1}{\noexpand\theabspage}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\pp@ppagectr}
+% \begin{macrocode}
+\def\pp@ppagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{%
+ \def\next{#3}%
+ \expandafter\ifx\csname pp@page@#1\endcsname\next
+ \addtocounter{pp@a@#1}\@ne
+ \else
+ \setcounter{pp@a@#1}{\value{pp@r@#1}}%
+ \fi
+ \global\expandafter\let\csname pp@page@#1\endcsname\next
+ \expandafter\pp@labeldef\expandafter
+ {\number\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}
+% \end{macrocode}
+% \end{macro}
+% \begin{macro}{\@testdef}
+% \LaTeX's current (2007) definition of this macro causes save stack
+% overflow. We fix this by an additional grouping. Delay to the
+% beginning of document to keep Babel happy.
+% \begin{macrocode}
+\AtBeginDocument{%
+ \begingroup
+ \@testdef{}{undefined}{}%
+ \expandafter
+ \endgroup
+ \ifx\@undefined\relax
+ \let\pp@@testdef\@testdef
+ \def\@testdef#1#2#3{{\pp@@testdef{#1}{#2}{#3}%
+ \if@tempswa\aftergroup\@tempswatrue\fi}}%
+ \fi}
+%
+% \end{macrocode}
+% \end{macro}
+%
+% \Finale
+% \endinput
+% Local Variables:
+% mode: doctex
+% TeX-master: "perpage.drv"
+% End:
diff -r dbff53e6f75f -r 08678c931f5d search_ppep.py
--- a/search_ppep.py Mon Jul 11 19:22:25 2022 +0000
+++ b/search_ppep.py Fri Oct 28 18:27:21 2022 +0000
@@ -237,7 +237,10 @@
# Parse Command Line
parser = argparse.ArgumentParser(
- description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+ description=" ".join([
+ "Phopsphoproteomic Enrichment",
+ "phosphopeptide SwissProt search (in place in SQLite DB)."
+ ])
)
# inputs:
@@ -249,7 +252,11 @@
nargs=1,
required=True,
dest="phosphopeptides",
- help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+ help=" ".join([
+ "Phosphopeptide data for experimental results,",
+ "generated by the Phopsphoproteomic Enrichment Localization",
+ "Filter tool"
+ ]),
)
parser.add_argument(
"--uniprotkb",
@@ -257,7 +264,10 @@
nargs=1,
required=True,
dest="uniprotkb",
- help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+ help=" ".join([
+ "UniProtKB/Swiss-Prot data, converted from FASTA format by the",
+ "Phopsphoproteomic Enrichment Kinase Mapping tool"
+ ]),
)
parser.add_argument(
"--schema",
@@ -310,7 +320,8 @@
cur.executescript(DROP_TABLES_SQL)
# if options.db_schema:
- # print("\nAfter dropping tables/views that are to be created, schema is:")
+ # print("\nAfter dropping tables/views that are to be created,"
+ # + schema is:")
# cur.execute("SELECT * FROM sqlite_schema")
# for row in cur.fetchall():
# if row[4] is not None:
@@ -403,7 +414,11 @@
deppep_count = row[0]
cur.execute(
- "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+ """
+ SELECT count(*) FROM (
+ SELECT Sequence FROM UniProtKB GROUP BY Sequence
+ )
+ """
)
for row in cur.fetchall():
sequence_count = row[0]
@@ -431,9 +446,11 @@
old_seq = ""
for row in cur.fetchall():
if duplicate_count == 0:
- print(
- "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
- )
+ print(" ".join([
+ "\nEach of the following sequences is associated with several",
+ "accession IDs (which are listed in the first column) but",
+ "the same gene ID (which is listed in the second column)."
+ ]))
if row[2] != old_seq:
old_seq = row[2]
duplicate_count += 1
@@ -480,13 +497,19 @@
)
else:
raise ValueError(
- "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
- % (UniProtKB_id,)
+ "UniProtKB_id %s, but Sequence is None: %s %s"
+ % (
+ UniProtKB_id,
+ "Check whether SwissProt file is missing",
+ "the sequence for this ID")
)
ker.execute(
"""
- SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
- FROM uniprotkb_pep_ppep_view
+ SELECT
+ count(*) ||
+ ' accession-peptide-phosphopeptide combinations were found'
+ FROM
+ uniprotkb_pep_ppep_view
"""
)
for row in ker.fetchall():
@@ -494,7 +517,9 @@
ker.execute(
"""
- SELECT count(*) || ' accession matches were found', count(*) AS accession_count
+ SELECT
+ count(*) || ' accession matches were found',
+ count(*) AS accession_count
FROM (
SELECT accession
FROM uniprotkb_pep_ppep_view
@@ -520,7 +545,9 @@
ker.execute(
"""
- SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+ SELECT
+ count(*) || ' phosphopeptide matches were found',
+ count(*) AS phosphopeptide_count
FROM (
SELECT phosphopeptide
FROM uniprotkb_pep_ppep_view