Mercurial > repos > galaxyp > mqppep_anova
changeset 1:08678c931f5d draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp |
---|---|
date | Fri, 28 Oct 2022 18:27:21 +0000 |
parents | dbff53e6f75f |
children | 2336fbff8866 |
files | KSEA_impl_flowchart.dia KSEA_impl_flowchart.pdf MaxQuantProcessingScript.R kinase_name_uniprot_lut.tabular.bz2 kinase_uniprot_description_lut.tabular.bz2 macros.xml mqppep_anova.R mqppep_anova.xml mqppep_anova_preamble.tex mqppep_anova_script.Rmd mqppep_mrgfltr.py perpage.tex search_ppep.py |
diffstat | 13 files changed, 5301 insertions(+), 1569 deletions(-) [+] |
line wrap: on
line diff
--- a/MaxQuantProcessingScript.R Mon Jul 11 19:22:25 2022 +0000 +++ b/MaxQuantProcessingScript.R Fri Oct 28 18:27:21 2022 +0000 @@ -220,7 +220,6 @@ type = "character", help = "pY or pST enriched samples (ie, 'Y' or 'ST')" ) - # default = "^Number of Phospho [(]STY[)]$", , make_option( c("-p", "--phosphoCol"), @@ -229,7 +228,6 @@ help = paste0("PERL-compatible regular expression matching", " header of column having number of 'Phospho (STY)'") ) - # default = "^Intensity[^_]", , make_option( c("-s", "--startCol"), @@ -238,7 +236,6 @@ help = paste0("PERL-compatible regular expression matching", " header of column having first sample intensity") ) - # default = 1, , make_option( c("-I", "--intervalCol"), @@ -247,7 +244,6 @@ help = paste0("Column interval between the Intensities of samples", " (eg, 1 if subsequent column; 2 if every other column") ) - # default = 0.75, , make_option( c("-l", "--localProbCutoff"), @@ -255,7 +251,6 @@ type = "double", help = "Localization Probability Cutoff" ) - # default = "sum", , make_option( c("-f", "--collapse_func"), @@ -264,7 +259,6 @@ help = paste0("merge identical phosphopeptides", " by ('sum' or 'average') the intensities") ) - # default = "filtered_data.txt", , make_option( c("-r", "--filtered_data"), @@ -272,7 +266,6 @@ type = "character", help = "filtered_data.txt" ) - # default = "quantData.txt", , make_option( c("-q", "--quant_data"),
--- a/macros.xml Mon Jul 11 19:22:25 2022 +0000 +++ b/macros.xml Fri Oct 28 18:27:21 2022 +0000 @@ -1,89 +1,47 @@ <macros> - <token name="@TOOL_VERSION@">0.1.13</token> + <token name="@TOOL_VERSION@">0.1.15</token> <token name="@VERSION_SUFFIX@">0</token> <xml name="requirements"> <requirements> <requirement type="package" version="1.56.0" >bioconductor-preprocesscore</requirement> - <requirement type="package" version="1.22.2" >numpy</requirement> + <requirement type="package" version="6.2.1" >gmp</requirement> + <requirement type="package" version="1.23.4" >numpy</requirement> <requirement type="package" version="0.3.3" >openblas</requirement> - <requirement type="package" version="1.4.1" >pandas</requirement> - <requirement type="package" version="1.64" >perl-dbd-sqlite</requirement> - <requirement type="package" version="5.26.2" >perl</requirement> - <requirement type="package" version="1.4.0" >pyahocorasick</requirement> - <requirement type="package" version="3.9.10" >python</requirement> - <requirement type="package" version="1.14.2" >r-data.table</requirement> - <requirement type="package" version="1.1.2" >r-dbi</requirement> - <requirement type="package" version="3.3.5" >r-ggplot2</requirement> + <requirement type="package" version="1.5.1" >pandas</requirement> + <requirement type="package" version="1.70" >perl-dbd-sqlite</requirement> + <requirement type="package" version="5.32.1" >perl</requirement> + <requirement type="package" version="1.4.4" >pyahocorasick</requirement> + <requirement type="package" version="3.10.6" >python</requirement> + <requirement type="package" version="4.1.3" >r-base</requirement> + <requirement type="package" version="6.0_93" >r-caret</requirement> + <requirement type="package" version="1.14.4" >r-data.table</requirement> + <requirement type="package" version="1.1.3" >r-dbi</requirement> + <requirement type="package" version="3.3.6" >r-ggplot2</requirement> <requirement type="package" version="3.1.3" >r-gplots</requirement> - <requirement type="package" version="0.9.4" >r-latex2exp</requirement> - <requirement type="package" version="1.7.1" >r-optparse</requirement> + <requirement type="package" version="0.9.5" >r-latex2exp</requirement> + <requirement type="package" version="1.7.3" >r-optparse</requirement> <requirement type="package" version="1.4.4" >r-reshape2</requirement> - <requirement type="package" version="2.11" >r-rmarkdown</requirement> - <requirement type="package" version="2.2.8" >r-rsqlite</requirement> - <requirement type="package" version="0.4.0" >r-sass</requirement> + <requirement type="package" version="2.17" >r-rmarkdown</requirement> + <!-- + <requirement type="package" version="2.2.18" >r-rsqlite</requirement> + <requirement type="package" version="0.4.2" >r-sass</requirement> + --> + <requirement type="package" version="1.2.2" >r-sessioninfo</requirement> <requirement type="package" version="0.4_11" >r-sqldf</requirement> - <requirement type="package" version="1.4.0" >r-stringr</requirement> - <requirement type="package" version="0.37" >r-tinytex</requirement> + <requirement type="package" version="1.4.1" >r-stringr</requirement> + <requirement type="package" version="0.42" >r-tinytex</requirement> <requirement type="package" version="0.3.7" >r-vioplot</requirement> <!-- It would be nice to use conda-forge/texlive-core rather than r-tinytex because the - former installs texlive when the package is built, but issue 23 blocked PDF-creation. + former installs texlive when the package is built, but issue 19/61 blocked PDF-creation. Also, texlive-core also gave pango font errors (output had missing symbols replaced with boxes) unless I specified the build as well as the version when building a conda environment, e.g.: texlive-core=20210325=h97429d4_0 --> </requirements> - <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are not omitted; also, version numbers should lead to reproducible behavior. Contrast execution times of this (about 18 seconds): - echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \ - bioconductor-preprocesscore=1.56.0 \ - numpy=1.22.2 \ - openblas=0.3.3 \ - pandas=1.4.1 \ - perl-dbd-sqlite=1.64 \ - perl-dbd-sqlite=1.64 \ - perl=5.26.2 \ - pyahocorasick=1.4.0 \ - python=3.9.10 \ - r-data.table=1.14.2 \ - r-dbi=1.1.2 \ - r-ggplot2=3.3.5 \ - r-gplots=3.1.3 \ - r-latex2exp=0.9.4 \ - r-optparse=1.7.1 \ - r-reshape2=1.4.4 \ - r-rmarkdown=2.11 \ - r-rsqlite=2.2.8 \ - r-sass=0.4.0 \ - r-sqldf=0.4_11 \ - r-stringr=1.4.0 \ - r-tinytex=0.37 \ - r-vioplot=0.3.7 - with this (42 or more seconds): - echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \ - bioconductor-preprocesscore= \ - numpy \ - openblas=0.3.3 \ - pandas \ - perl \ - perl-dbd-sqlite \ - perl-dbd-sqlite \ - pyahocorasick \ - python \ - r-data.table \ - r-dbi \ - r-ggplot2 \ - r-gplots \ - r-latex2exp \ - r-optparse \ - r-reshape2 \ - r-rmarkdown \ - r-rsqlite \ - r-sass \ - r-sqldf \ - r-stringr \ - r-tinytex \ - r-vioplot - + <!-- I specified the versions above because it takes a VERY long time + to search for package versions when they are not omitted; also, + locking version numbers might lead to more-reproducible behavior. --> </xml> </macros>
--- a/mqppep_anova.R Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova.R Fri Oct 28 18:27:21 2022 +0000 @@ -1,20 +1,15 @@ #!/usr/bin/env Rscript # libraries library(optparse) -library(data.table) library(stringr) +library(tinytex) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # parse options option_list <- list( - make_option( - c("-i", "--inputFile"), - action = "store", - default = NA, - type = "character", - help = "Phosphopeptide Intensities sparse input file path" - ), + + # files make_option( c("-a", "--alphaFile"), action = "store", @@ -24,64 +19,11 @@ " path to text file having one column and no header") ), make_option( - c("-S", "--preproc_sqlite"), - action = "store", - default = NA, - type = "character", - help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" - ), - make_option( - c("-K", "--ksea_sqlite"), + c("-M", "--anova_ksea_metadata"), action = "store", - default = NA, - type = "character", - help = "Path to 'ksea_sqlite' output produced by this tool" - ), - make_option( - c("-f", "--firstDataColumn"), - action = "store", - default = "^Intensity[^_]", - type = "character", - help = "First column of intensity values" - ), - make_option( - c("-m", "--imputationMethod"), - action = "store", - default = "random", + default = "anova_ksea_metadata.tsv", type = "character", - help = paste0("Method for missing-value imputation,", - " one of c('group-median','median','mean','random')") - ), - make_option( - c("-p", "--meanPercentile"), - action = "store", - default = 3, - type = "integer", - help = paste0("Mean percentile for randomly generated imputed values;", - ", range [1,99]") - ), - make_option( - c("-d", "--sdPercentile"), - action = "store", - default = 3, - type = "double", - help = paste0("Adjustment value for standard deviation of", - " randomly generated imputed values; real") - ), - make_option( - c("-s", "--regexSampleNames"), - action = "store", - default = "\\.(\\d+)[A-Z]$", - type = "character", - help = "Regular expression extracting sample-names" - ), - make_option( - c("-g", "--regexSampleGrouping"), - action = "store", - default = "(\\d+)", - type = "character", - help = paste0("Regular expression extracting sample-group", - " from an extracted sample-name") + help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" ), make_option( c("-o", "--imputedDataFile"), @@ -102,11 +44,56 @@ ) ), make_option( + c("-i", "--inputFile"), + action = "store", + default = NA, + type = "character", + help = "Phosphopeptide Intensities sparse input file path" + ), + make_option( + c("-K", "--ksea_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'ksea_sqlite' output produced by this tool" + ), + make_option( + c("-S", "--preproc_sqlite"), + action = "store", + default = NA, + type = "character", + help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`" + ), + make_option( c("-r", "--reportFile"), action = "store", - default = "QuantDataProcessingScript.html", + default = "mqppep_anova.pdf", + type = "character", + help = "PDF report file path" + ), + + # parameters + make_option( + c("-f", "--firstDataColumn"), + action = "store", + default = "^Intensity[^_]", type = "character", - help = "HTML report file path" + help = "First column of intensity values" + ), + make_option( + c("-m", "--imputationMethod"), + action = "store", + default = "random", + type = "character", + help = paste0("Method for missing-value imputation,", + " one of c('group-median','median','mean','random')") + ), + make_option( + c("-C", "--intensityMinValuesPerClass"), + action = "store", + default = "0", + type = "integer", + help = "Minimum number of observed values per class" ), make_option( c("-k", "--ksea_cutoff_statistic"), @@ -124,14 +111,120 @@ help = paste0("Maximum score to be used to score a kinase enrichment as significant") ), make_option( - c("-M", "--anova_ksea_metadata"), + c("-c", "--kseaMinSubstrateCount"), + action = "store", + default = "1", + type = "integer", + help = "Minimum number of substrates to consider any kinase for KSEA" + ), + make_option( + c("--kseaUseAbsoluteLog2FC"), + action = "store_true", + default = "FALSE", + type = "logical", + help = paste0("Should abs(log2(fold-change)) be used for KSEA?", + " (TRUE may alter number of hits.)") + ), + make_option( + c("-p", "--meanPercentile"), + action = "store", + default = 3, + type = "integer", + help = paste0("Mean percentile for randomly generated imputed values;", + ", range [1,99]") + ), + make_option( + c("--minQuality"), + action = "store", + default = 0, + type = "integer", + help = paste0("Minimum quality (higher value reduces number of substrates", + " accepted; you may want to keep below 100), range [0,infinity]") + ), + make_option( + c("--oneWayManyCategories"), + action = "store", + default = "aov", + type = "character", + help = "Name of R function for one-way tests among more than two categories" + ), + make_option( + c("--oneWayTwoCategories"), action = "store", - default = "anova_ksea_metadata.tsv", + default = "two.way", + type = "character", + help = "Name of R function for one-way tests between two categories" + ), + make_option( + c("-s", "--regexSampleNames"), + action = "store", + default = "\\.(\\d+)[A-Z]$", + type = "character", + help = "Regular expression extracting sample-names" + ), + make_option( + c("-g", "--regexSampleGrouping"), + action = "store", + default = "(\\d+)", type = "character", - help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments" + help = paste0("Regular expression extracting sample-group", + " from an extracted sample-name") + ), + make_option( + c("-d", "--sdPercentile"), + action = "store", + default = 3, + type = "double", + help = paste0("Adjustment value for standard deviation of", + " randomly generated imputed values; real") + ), + make_option( + c("-F", "--sampleGroupFilter"), + action = "store", + default = "none", + type = "character", + help = paste0("Should no filter be applied to sample group names (none)", + " or should the filter specify samples to include or exclude?") + ), + make_option( + c("--sampleGroupFilterMode"), + action = "store", + default = "r", + type = "character", + help = paste0("First character ('f', 'p', or 'r') indicating regular", + "expression matching mode ('fixed', 'perl', or 'grep'; ", + "see https://rdrr.io/r/base/grep.html). Second character may be 'i;", + "to make search ignore case.") + ), + make_option( + c("-G", "--sampleGroupFilterPatterns"), + action = "store", + default = ".*", + type = "character", + help = paste0("Regular expression extracting sample-group", + " from an extracted sample-name") ) ) -args <- parse_args(OptionParser(option_list = option_list)) + +tryCatch( + args <- parse_args( + OptionParser( + option_list = option_list, + add_help_option = TRUE + ), + print_help_and_exit = TRUE + ), + error = function(e) { + parse_args( + OptionParser( + option_list = option_list, + add_help_option = TRUE + ), + print_help_and_exit = TRUE + ) + stop(as.character(e)) + } +) print("args is:") cat(str(args)) @@ -140,16 +233,34 @@ if (! file.exists(args$inputFile)) { stop((paste("Input file", args$inputFile, "does not exist"))) } -input_file <- args$inputFile -alpha_file <- args$alphaFile -preproc_sqlite <- args$preproc_sqlite -imputed_data_file_name <- args$imputedDataFile -imp_qn_lt_data_filenm <- args$imputedQNLTDataFile -anova_ksea_metadata <- args$anova_ksea_metadata -report_file_name <- args$reportFile -ksea_sqlite <- args$ksea_sqlite -ksea_cutoff_statistic <- args$ksea_cutoff_statistic -ksea_cutoff_threshold <- args$ksea_cutoff_threshold + +# files +alpha_file <- args$alphaFile +anova_ksea_metadata_file <- args$anova_ksea_metadata +imp_qn_lt_data_file <- args$imputedQNLTDataFile +imputed_data_file <- args$imputedDataFile +input_file <- args$inputFile +ksea_sqlite_file <- args$ksea_sqlite +preproc_sqlite_file <- args$preproc_sqlite +report_file_name <- args$reportFile + +# parameters +# firstDataColumn - see below +group_filter <- args$sampleGroupFilter +group_filter_mode <- args$sampleGroupFilterMode +# imputationMethod - see below +intensity_min_values_per_class <- args$intensityMinValuesPerClass +ksea_cutoff_statistic <- args$ksea_cutoff_statistic +ksea_cutoff_threshold <- args$ksea_cutoff_threshold +ksea_min_substrate_count <- args$kseaMinSubstrateCount +ksea_use_absolute_log2_fc <- args$kseaUseAbsoluteLog2FC +# mean_percentile - see below +min_quality <- args$minQuality +# regexSampleNames - see below +# regexSampleGrouping - see below +# sampleGroupFilterPatterns - see below (becomes group_filter_patterns) +# sd_percentile - see below + if ( sum( grepl( @@ -192,32 +303,59 @@ # - regexSampleNames # - regexSampleGrouping read_config_file_string <- function(fname, limit) { + cat(sprintf("read_config_file_string: fname = '%s'\n", fname)) + cat(sprintf("length(fname) = '%s'\n", length(fname))) + result <- + if (file.exists(fname)) { + cat(sprintf("reading '%s' ...\n", fname)) + readChar(fname, limit) + } else { + cat(sprintf("not a file: '%s'\n", fname)) + fname + } + #AC print(paste0("read_config_file_string: opening file '", as.character(fname), "'")) # eliminate any leading whitespace - result <- gsub("^[ \t\n]*", "", readChar(fname, limit)) + result <- gsub("^[ \t\n]*", "", result) # eliminate any trailing whitespace - result <- gsub("[ \t\n]*$", "", result) + result <- gsub("[ \t\n]*$", "", result) # substitute characters escaped by Galaxy sanitizer - result <- gsub("__lt__", "<", result) - result <- gsub("__le__", "<=", result) - result <- gsub("__eq__", "==", result) - result <- gsub("__ne__", "!=", result) - result <- gsub("__gt__", ">", result) - result <- gsub("__ge__", ">=", result) - result <- gsub("__sq__", "'", result) - result <- gsub("__dq__", '"', result) - result <- gsub("__ob__", "[", result) - result <- gsub("__cb__", "]", result) + result <- gsub("__lt__", "<", result) + result <- gsub("__le__", "<=", result) + result <- gsub("__eq__", "==", result) + result <- gsub("__ne__", "!=", result) + result <- gsub("__gt__", ">", result) + result <- gsub("__ge__", ">=", result) + result <- gsub("__sq__", "'", result) + result <- gsub("__dq__", '"', result) + result <- gsub("__ob__", "[", result) + result <- gsub("__cb__", "]", result) } +nc <- 1000 + +sink(stderr()) + cat(paste0("first_data_column file: ", args$firstDataColumn, "\n")) -cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) -cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) -nc <- 1000 -regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) -regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) first_data_column <- read_config_file_string(args$firstDataColumn, nc) cat(paste0("first_data_column: ", first_data_column, "\n")) + +cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n")) +regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc) +cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) + +cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n")) +regex_sample_names <- read_config_file_string(args$regexSampleNames, nc) cat(paste0("regex_sample_names: ", regex_sample_names, "\n")) -cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n")) + +if (group_filter != "none") { + cat(paste0("group_filter_patterns file: '", args$sampleGroupFilterPatterns, "'\n")) + group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc) +} else { + group_filter_patterns <- ".*" +} +cat(paste0("group_filter_patterns: ", group_filter_patterns, "\n")) + +sink() + # from: https://github.com/molgenis/molgenis-pipelines/wiki/ # How-to-source-another_file.R-from-within-your-R-script @@ -253,45 +391,72 @@ return(NULL) } -script_dir <- location_of_this_script() +# validation of input parameters is complete; it is now justifiable to +# install LaTeX tools to render markdown as PDF; this involves a big +# download from GitHub +if (!tinytex::is_tinytex()) tinytex::install_tinytex() rmarkdown_params <- list( - inputFile = input_file - , alphaFile = alpha_file - , preprocDb = preproc_sqlite + + # files + alphaFile = alpha_file + , anovaKseaMetadata = anova_ksea_metadata_file + , imputedDataFilename = imputed_data_file + , imputedQNLTDataFile = imp_qn_lt_data_file + , inputFile = input_file + , kseaAppPrepDb = ksea_sqlite_file + , preprocDb = preproc_sqlite_file + + # parameters , firstDataColumn = first_data_column + , groupFilter = group_filter + , groupFilterMode = group_filter_mode # arg sampleGroupFilterMode + , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns , imputationMethod = imputation_method + , intensityMinValuesPerGroup = intensity_min_values_per_class + , kseaCutoffStatistic = ksea_cutoff_statistic + , kseaCutoffThreshold = ksea_cutoff_threshold + , kseaMinSubstrateCount = ksea_min_substrate_count + , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add , meanPercentile = mean_percentile - , sdPercentile = sd_percentile + , minQuality = min_quality # add + , regexSampleGrouping = regex_sample_grouping , regexSampleNames = regex_sample_names - , regexSampleGrouping = regex_sample_grouping - , imputedDataFilename = imputed_data_file_name - , imputedQNLTDataFile = imp_qn_lt_data_filenm - , anovaKseaMetadata = anova_ksea_metadata - , kseaAppPrepDb = ksea_sqlite - , kseaCutoffThreshold = ksea_cutoff_threshold - , kseaCutoffStatistic = ksea_cutoff_statistic + , sdPercentile = sd_percentile ) print("rmarkdown_params") -str(rmarkdown_params) +print(rmarkdown_params) +print( + lapply( + X = rmarkdown_params, + FUN = function(x) { + paste0( + nchar(as.character(x)), + ": '", + as.character(x), + "'" + ) + } + ) +) + # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) -# BUG (or "opportunity") -# To render as PDF for the time being requires installing the conda -# package `r-texlive` until this issue in `texlive-core` is resolved: -# https://github.com/conda-forge/texlive-core-feedstock/issues/19 -# This workaround is detailed in the fourth comment of: -# https://github.com/conda-forge/texlive-core-feedstock/issues/61 +script_dir <- location_of_this_script() -library(tinytex) -tinytex::install_tinytex() rmarkdown::render( input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/") -, output_format = rmarkdown::pdf_document(toc = TRUE) , output_file = report_file_name , params = rmarkdown_params +, output_format = rmarkdown::pdf_document( + includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex") + , dev = "pdf" + , toc = TRUE + , toc_depth = 2 + , number_sections = FALSE + ) )
--- a/mqppep_anova.xml Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova.xml Fri Oct 28 18:27:21 2022 +0000 @@ -7,6 +7,28 @@ <description>Runs ANOVA and KSEA for phosphopeptides.</description> <macros> <import>macros.xml</import> + <xml name="group_matching_parm"> + <param name="group_filter_mode" type="select" + help="Regular expression matching mode 'fixed', 'perl', or 'grep' with option for case insensitivity. See https://rdrr.io/r/base/grep.html" + label="Sample-group matching mode" + > + <option value="r">ERE ("extended regular expressions")</option> + <option value="ri"> - ERE, case insensitive</option> + <option value="p" selected="true">PCRE ("PERL-compatible regular expressions")</option> + <option value="pi"> - PCRE, case insensitive</option> + <option value="f">fixed strings ("no regular expressions")</option> + <option value="fi"> - fixed strings, case insensitive</option> + </param> + <param name="group_filter_patterns" type="text" value=".+" + help="Comma-separated list of regular expressions matching group-names" + label="Sample-group matching pattern"> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + </sanitizer> + </param> + </xml> </macros> <edam_topics> <edam_topic>topic_0121</edam_topic><!-- proteomics --> @@ -27,29 +49,58 @@ both need access to a writeable directory, but most directories in a biocontainer are read-only, so this builds a pseudo-home under /tmp --> + <required_files> + <include path="KSEA_impl_flowchart.pdf" /> + <include path="kinase_name_uniprot_lut.tabular.bz2" /> + <include path="kinase_uniprot_description_lut.tabular.bz2" /> + <include path="kinase_uniprot_description_lut.tabular.bz2" /> + <include path="mqppep_anova.R" /> + <include path="mqppep_anova_preamble.tex" /> + <include path="mqppep_anova_script.Rmd" /> + <include path="perpage.tex" /> + </required_files> <command detect_errors="exit_code"><![CDATA[ + (printenv | sort) && cp '$__tool_directory__/mqppep_anova_script.Rmd' . && - cp '$__tool_directory__/mqppep_anova.R' . && + cp '$__tool_directory__/mqppep_anova.R' . && + cp '$__tool_directory__/kinase_name_uniprot_lut.tabular.bz2' . && + cp '$__tool_directory__/kinase_uniprot_description_lut.tabular.bz2' . && + cp '$__tool_directory__/mqppep_anova_preamble.tex' . && + cp '$__tool_directory__/perpage.tex' . && + cp '$__tool_directory__/KSEA_impl_flowchart.pdf' . && Rscript mqppep_anova.R --inputFile '$input_file' --alphaFile '$alpha_file' --preproc_sqlite '$preproc_sqlite' - --firstDataColumn $intensity_column_regex_f + --firstDataColumn '$intensity_column_regex_f' --imputationMethod $imputation.imputation_method #if $imputation.imputation_method == "random" --meanPercentile '$imputation.meanPercentile' --sdPercentile '$imputation.sdPercentile' #end if - --regexSampleNames $sample_names_regex_f - --regexSampleGrouping $sample_grouping_regex_f - --imputedDataFile $imputed_data_file + --regexSampleNames '$sample_names_regex_f' + --regexSampleGrouping '$sample_grouping_regex_f' + #if $group_filter.group_filter_method == "none" + --sampleGroupFilter 'none' + #else + --sampleGroupFilter '$group_filter.group_filter_method' + --sampleGroupFilterPatterns '$group_filter_patterns_f' + --sampleGroupFilterMode '$group_filter.group_filter_mode' + #end if + --intensityMinValuesPerClass '$intnsty_min_vals_per_smpl_grp' + --imputedDataFile '$imputed_data_file' --imputedQNLTDataFile '$imp_qn_lt_file' --ksea_sqlite '$ksea_sqlite' + --kseaMinSubstrateCount '$ksea_min_substrate_count' --ksea_cutoff_threshold '$ksea_cutoff_threshold' --ksea_cutoff_statistic 'FDR' - --reportFile '$report_file' + --kseaUseAbsoluteLog2FC '$ksea_use_absolute_log2_fc' + --minQuality '$ksea_min_quality' --anova_ksea_metadata '$anova_ksea_metadata' + --reportFile '$report_file' ]]></command> + <!-- + --> <configfiles> <configfile name="sample_names_regex_f"> $sample_names_regex @@ -57,18 +108,39 @@ <configfile name="sample_grouping_regex_f"> $sample_grouping_regex </configfile> + <configfile name="group_filter_patterns_f"> + #if $group_filter.group_filter_method != "none" + $group_filter.group_filter_patterns + #end if + </configfile> <configfile name="intensity_column_regex_f"> $intensity_column_regex </configfile> </configfiles> <inputs> - <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities" - help="Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]" + <!-- + needed inputs: + - # should filters be used to identify sample-groups to be included or excluded + sampleGroupFilter: !r c("none", "exclude", "include")[3] + - # what patterns should be used to match sample-groups + # (extracted by regexSampleGrouping) when determining sample-groups + # that should be included or excluded + sampleGroupFilterPatterns: ".*CR,N.*" + - # minimum number of observed values per class + intensityMinPerClass: 0 + - # what should be the primary criterion to eliminate excessive heatmap rows + intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] + suggested or advanced inputs: + - kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" + - kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" + --> + <param name="input_file" type="data" format="tabular" label="Filtered phosphopeptide intensities (tabular)" + help="'preproc_tab' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool" /> - <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level" + <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level (tabular)" help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header" /> - <param name="preproc_sqlite" type="data" format="sqlite" label="preproc_sqlite dataset from mqppep_preproc" + <param name="preproc_sqlite" type="data" format="sqlite" label="Database from mqppep_preproc (sqlite)" help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool" /> <param name="intensity_column_regex" type="text" value="^Intensity[^_]" @@ -78,7 +150,7 @@ <!-- imputation_method <- c("group-median","median","mean","random")[1] --> <conditional name="imputation"> <param name="imputation_method" type="select" label="Imputation method" - help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])" + help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same SD as across all samples (with mean specified by 'Mean percentile for random values')" > <option value="random" selected="true">random</option> <option value="group-median">group-median</option> @@ -93,15 +165,15 @@ label="Mean percentile for random values" help="Percentile center of random values; range [1,99]" /> - <param name="sdPercentile" type="float" value="1.0" - label="Percentile std. dev. for random values" - help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)" + <param name="sdPercentile" type="float" value="1" + label="Percentile SD for random values" + help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD of random values equal to the SD for the entire data set.)" /> </when> </conditional> <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$" - help="Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)" - label="Sample-extraction pattern"> + help="Pattern extracting sample-names from names of columns of 'Filtered phosphopeptide intensities' that have peptide intensity data (PERL-compatible regular expression)" + label="Sample-name extraction pattern"> <sanitizer> <valid initial="string.printable"> <remove value="'"/> @@ -109,32 +181,65 @@ </sanitizer> </param> <param name="sample_grouping_regex" type="text" value="\d+" - help="Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)" - label="Group-extraction pattern"> + help="Pattern extracting sample-group from the extracted sample-names (PERL-compatible regular expression)" + label="Sample-group extraction pattern"> <sanitizer> <valid initial="string.printable"> <remove value="'"/> </valid> </sanitizer> </param> + <param name="intnsty_min_vals_per_smpl_grp" type="integer" value="1" min="0" + label="Minimum number of values per sample-group" + help="Only consider as comparable those intensities having at least this number of values in each sample-group (range [0,∞])" + /> + <conditional name="group_filter"> + <param name="group_filter_method" type="select" label="Filter sample-groups" + help="What filter should be applied to sample-group names? (1) 'none', no filter; (2) 'include', match is required; (3) 'exclude', match is forbidden." + > + <option value="none" selected="true">none</option> + <option value="include">include</option> + <option value="exclude">exclude</option> + </param> + <when value="none" /> + <when value="include"> + <expand macro="group_matching_parm"/> + </when> + <when value="exclude"> + <expand macro="group_matching_parm"/> + </when> + </conditional> + <param name="ksea_min_substrate_count" type="integer" value="1" min="1" + label="Minimum number of kinase-substrates for KSEA" + help="Minimum number of substrates to consider any kinase for KSEA (range [1,∞])" + /> <param name="ksea_cutoff_threshold" type="float" value="0.05" label="KSEA threshold level" - help="Maximum FDR to be used to score a kinase enrichment as significant" + help="Maximum FDR to be used to score a kinase enrichment as significant; see warning against setting this too low in help text below." + /> + <param name="ksea_use_absolute_log2_fc" + type="boolean" + label="Use abs(log2(fold-change)) for KSEA" + help="Should log2(fold-change) be used for KSEA? (Checking this may alter (possibly reduce) the number of hits.)" + checked="false" + truevalue="TRUE" + falsevalue="FALSE" + /> + <param name="ksea_min_quality" type="integer" value="0" min="0" + label="Minimum quality of substrates for KSEA" + help="Minimum 'quality' of substrates to be considered for KSEA (range [0,∞]); higher numbers reduce the number of substrates considered - see help text below." /> </inputs> <outputs> - <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" ></data> - <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" ></data> - <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" ></data> - <!-- - <data name="report_file" format="html" label="${input_file.name}.${imputation.imputation_method}-imputed_report (download/unzip to view)" ></data> - --> - <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" ></data> - <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite"> - </data> + <!-- earlier outputs will appear lower in the history list; therefore, put report at the top --> + <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite" /> + <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" /> + <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" /> + <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" /> + <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" /> </outputs> <tests> - <test> + <test><!-- test #1 --> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> @@ -156,20 +261,32 @@ <has_text text="Phosphopeptide" /> <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> <!-- missing missing observed missing observed observed --> - <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.088909" /> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.093748" /> <has_text text="pSQKQEEENPAEETGEEK" /> </assert_contents> </output> </test> - <test> + <test><!-- test #2 --> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> <param name="intensity_column_regex" value="^Intensity[^_]"/> <param name="imputation_method" value="mean"/> + <!-- + <param name="meanPercentile" value="1"/> + <param name="sdPercentile" value="1"/> + --> <param name="sample_names_regex" value="\.\d+[A-Z]$"/> <param name="sample_grouping_regex" value="\d+"/> + <param name="intnsty_min_vals_per_smpl_grp" value="1"/> + <param name="group_filter_method" value="none"/> + <!-- + <param name="group_filter_mode" value="r"/> + <param name="group_filter_patterns" value="\.+"/> + --> + <param name="ksea_min_substrate_count" value="1"/> + <param name="ksea_cutoff_threshold" value="0.5"/> <output name="imputed_data_file"> <assert_contents> <has_text text="Phosphopeptide" /> @@ -184,18 +301,30 @@ <has_text text="Phosphopeptide" /> <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> <!-- missing missing observed missing observed observed --> - <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.092451" /> + <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.097251" /> </assert_contents> </output> </test> - <test> + <test><!-- test #3 --> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> <param name="intensity_column_regex" value="^Intensity[^_]"/> <param name="imputation_method" value="group-median"/> + <!-- + <param name="meanPercentile" value="1"/> + <param name="sdPercentile" value="1"/> + --> <param name="sample_names_regex" value="\.\d+[A-Z]$"/> <param name="sample_grouping_regex" value="\d+"/> + <param name="intnsty_min_vals_per_smpl_grp" value="1"/> + <param name="group_filter_method" value="none"/> + <!-- + <param name="group_filter_mode" value="r"/> + <param name="group_filter_patterns" value="\.+"/> + --> + <param name="ksea_min_substrate_count" value="1"/> + <param name="ksea_cutoff_threshold" value="0.5"/> <output name="imputed_data_file"> <assert_contents> <has_text text="Phosphopeptide" /> @@ -214,7 +343,7 @@ </assert_contents> </output> </test> - <test> + <test><!-- test #4 --> <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> @@ -237,8 +366,8 @@ <assert_contents> <has_text text="Phosphopeptide" /> <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> - <has_text text="5.409549" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B --> - <has_text text="6.464714" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A --> + <has_text text="5.522821" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B --> + <has_text text="6.638251" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A --> </assert_contents> </output> </test> @@ -248,14 +377,34 @@ Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA ==================================================== -**Input files** +**Overview** +============ + +Perform statistical analysis of preprocessed MaxQuant output data collected as described in `[Cheng, 2018] <https://doi.org/10.3791/57996>`_. + + - Extracts sample-group IDs from sample names. + - Imputes missing values. + - Performs ANOVA analysis for each phosphopeptide. + - Performs Kinase-Substrate Enrichment Analysis (KSEA) using the method described by `Casado et al. (2013) <doi:10.1126/scisignal.2003573>`_; see *"Algorithms"* section below. -``Filtered Phosphopeptide Intensities`` +**Workflow position** +===================== + +Upstream tool + The "MaxQuant Phosphopeptide Preprocessing" tool (``mqppep_preproc``) that transforms MaxQuant output for phospoproteome-enriched samples into a form suitable for statistical analysis. + +**Input datasets** +================== + +``Filtered phosphopeptide intensities`` (tabular) Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format). - This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter" - (``mqppep_mrgflt``) tool. + This is the output from the "MaxQuant Phopsphopeptide Preprocessing" + (``mqppep_preproc``) tool. -``ANOVA alpha cutoff level`` + - First column label 'Phosphopeptide'. + - Sample-intensities must begin in first column matching 'Intensity-column pattern' and must have column labels to match argument 'Sample-name extraction pattern'. + +``ANOVA alpha cutoff level`` (tabular) List of alpha cutoff values for significance testing; text file having one column and no header. For example: :: @@ -264,60 +413,99 @@ 0.1 0.05 +``Database from mqppep_preproc`` (sqlite) + SQLite database produced by the "MaxQuant Phopsphopeptide Preprocessing" + (``mqppep_preproc``) tool. + **Input parameters** +==================== ``Intensity-column pattern`` - First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity** + First column of ``Filtered phosphopeptide intensities`` having intensity values (integer or PERL-compatible regular expression matching column label). Default:: + + ^Intensity[^_] ``Imputation method`` Impute missing values by: - 1. ``group-median`` - use median for each sample-group; - 2. ``mean`` - use mean across all samples; or - 3. ``median`` - use median across all samples; - 4. ``random`` - use randomly generated values where: + 1. ``group-median`` - use median for each sample-group; + 2. ``mean`` - use mean across all samples; or + 3. ``median`` - use median across all samples; + 4. ``random`` - use randomly generated values where: - - ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and - - ``Percentile std. dev. for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. + (i) ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and + (ii) ``Percentile SD for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. -``Sample-extraction pattern`` - PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample. +``Sample-name extraction pattern`` + PERL-compatible regular expression extracting the sample-name from the the name of a column of intensities (from ``Filtered phosphopeptide intensities``) for one sample. - - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A`` + - For example, ``"\.\d+[A-Z]$"`` applied to "``Intensity.splunge.10A``" would produce "``.10A``". - Note that *this is case sensitive* by default. -``Group-extraction pattern`` - PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensites (from ``input_file``). +``Sample-group extraction pattern`` + PERL-compatible regular expression extracting the sample-grouping from the sample-name (that was in turn extracted with ``Sample-name extraction pattern`` from a column of intensites from ``Filtered phosphopeptide intensities``). + + - For example, ``"\d+$"`` applied to "``.10A``" would produce "``10``". + - Note that *this is case sensitive* by default. + +``Minimum number of values per sample-group`` + Sometimes you may wish to filter out the intensities that are poorly represented among some sample groups because they complicate the comparison process. You can use this parameter to specify the minimum number of values in any sample-group (range [0,]]>∞<![CDATA[]) - - For example, ``"\d+$"`` applied to ``.10A`` would produce ``10`` - - Note that *this is case sensitive* by default. +``Filter sample-groups`` + Sometimes you may have spectra that are for treatments that you are not considering for your comparison. You can specify a filter (or not) for sample-group names; if you do, you can specify whether groups that match your criteria should be excluded from the analysis ("forbidden") or included in the analysis ("required"). + +``Sample-group matching mode`` + The R `base::grep` function that is used here for pattern matching is exhaustively documented at https://rdrr.io/r/base/grep.html. There are two choices you make here. The first is whether to differentiate lowercase and uppercase characters. The second is wheter to require exact matches ("fixed" pattern-matching mode) or to use "PERL-compatible regular expressions) ("perl") or "extendd regular expressions" ("grep"). See https://rdrr.io/r/base/grep.html for further info. + +``Sample-group matching pattern`` + This is a comma-separated list of patterns to match to group-names, according to the ``Sample-group matching mode`` that you have chosen. + +``Minimum number of kinase-substrates for KSEA`` + For KSEA, you may decide that you wish to ignore kinases having fewer substrates than some minimum; specify that minimum here (range [1,]]>∞<![CDATA[]) ``KSEA threshold level`` - Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of 0.05 is arbitrary. + Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of ``0.05`` is arbitrary and may exclude kinases that are interesting. The KSEA FDR perhaps should not be treated as conservatively as would be appropriate for hypothesis testing. For example, at an FDR of ``0.05``, for every ``20`` kinases that on discards, ``19`` are likely truely enriched. + +``Use abs(log2(fold-change)) for KSEA`` + When TRUE, consider only the magnitude of the differences across the contrast for all of the substrates when aggregating them to assess the enrichment of a given kinase's substrates. When FALSE, also consider the direction. Surprisingly, setting this to TRUE may decrease the enriched kinases. + +``Minimum quality of substrates for KSEA`` + An arbitrary "quality score" is assigned to each substrate, as described in the PDF report produced by the tool. This score takes into account both FDR-adjusted p-value and the number of missing values for each substrate. Setting the minimum to zero retains all substrates, which may be a large number. **Outputs** +=========== -``imputed_intensities (input_file.imputation_method-imputed_intensities)`` - Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format. +Report dataset + *[input file].[imputation method]*-``imputed_report`` + + Summary report for normalization, imputation, and **ANOVA**, in PDF format. -``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)`` - Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format. +Imputed intensities + *[input file].[imputation method]*-``imputed_intensities`` + + Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format. -``report_file (input_file.imputation_method-imputed_report)`` - Summary report for normalization, imputation, and **ANOVA**, in PDF format. +Imputed quantum-normalized log-transformed intensities + *[input file].[imputation method]*-``imputed_QN_LT_intensities`` + + Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format. -``anova_ksea_metadata (input_file.imputation_method-imputed_anova_ksea_metadata)`` - Phosphopeptide metadata including ANOVA significance and KSEA enrichments. +ANOVA KSEA metadata + *[input file].[imputation method]*-``imputed_anova_ksea_metadata`` + Phosphopeptide metadata including ANOVA significance and KSEA enrichments. -``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)`` - SQLite database for ad-hoc report creation. +KSEA SQLite database sqlite + *[input file].[imputation method]*-``imputed_ksea_sqlite`` + An SQLite database that is usable for *ad hoc* report creation. **Algorithm** +============= -The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017]. -The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool. +The KSEA algorithm used here is as in the KSEAapp package as reported in `[Wiredja 2017] <https://doi.org/10.1093/bioinformatics/btx415>`_. +The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." <https://cran.r-project.org/package=KSEAapp>`_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches. **Authors** +=========== ``Larry C. Cheng`` (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script. @@ -337,5 +525,11 @@ <citation type="doi">10.3791/57996</citation> <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 --> <citation type="doi">10.1093/bioinformatics/btx415</citation> + <citation type="bibtex">@Manual{, + title = {KSEAapp: Kinase-Substrate Enrichment Analysis}, + author = {Danica D. Wiredja}, + year = {2017}, + note = {R package version 0.99.0}, + }</citation> </citations> </tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_anova_preamble.tex Fri Oct 28 18:27:21 2022 +0000 @@ -0,0 +1,90 @@ +% ----------------------------------------------------------------------------- +% preamble includes BEGIN +% ----------------------------------------------------------------------------- +\usepackage{longtable, lscape, ifthen} + +% ----------------------------------------------------------------------------- +% put \T or \B at the ends of lines to add space for super- or sub- +% scripts above or below, respectively +% ref: ? +\newcommand\T{\rule{0pt}{2.6ex}} % Top strut +\newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut + +% ----------------------------------------------------------------------------- +% horizontal line commands; ideally, these would compute the width rather than +% hardcoding it +% ref: ? +\def\hlinport{\makebox[6.5in]{\hrulefill} \\} % hline outside tabular, port +\def\hlinlscp{\makebox[9in]{\hrulefill} \\} % hline outside tabular, lndscp +%ref: https://stackoverflow.com/a/67335722 +\def\hlinnotab{\\makebox[1.0\linewidth]{\hrulefill}\\[1ex]} + +% ----------------------------------------------------------------------------- +% ref: https://latex.org/forum/viewtopic.php?p=23257#p23257 +\newcommand{\nonemptyline}[1]{% + %\ifthenelse{\equal{#1}{}}{do when empty}{do when not empty} + \ifthenelse{\equal{#1}{}}{}{#1}% +} + +% ----------------------------------------------------------------------------- +% For RMarkdown, I needed to put this into a preamble.tex file and include it +% via `output: pdf_document: includes: in_header: preamble.tex` because +% Markdown was expanding the \tabfill command before writing the tex file +% ref: https://tex.stackexchange.com/a/119477 in reply to +% https://tex.stackexchange.com/questions/119473/tabbing-and-line-wrapping +\makeatletter +\newlength\tdima +\newcommand\tabfill[1]{\setlength\tdima{\linewidth}% + \addtolength\tdima{\@totalleftmargin}% + \addtolength\tdima{-\dimen\@curtab}% + \parbox[t]{\tdima}{#1\ifhmode\strut\fi}} + %\parbox[t]{\tdima}{\nonemptyline{#1}\ifhmode\strut\fi}} +\makeatother +% +% Create a tabbing environment in which to use tabfill +% param #1 is specified the tabstops (as expected by the tabbing +% environment) and is provided in braces after invocation, e.g.: +% \begin{tabwrap}{\hspace{1.25in}\=} +% param #2 is the contents of the envirnent +\newenvironment{tabwrap}[2]{% + \begin{tabbing}#1\kill\ignorespaces% + #2}% + {\end{tabbing}% +} + +% ----------------------------------------------------------------------------- +% Make a caption for a non-floating figure or table, e.g., +% ref: https://github.com/rf-latex/capt-of/blob/main/capt-of.dtx +% https://texfaq.org/FAQ-figurehere +% +% Usage: \captionof{*type*}[*move*]{*caption*} +% *type* is `figure` or `table` (or some type you've +% defined with the`float` package) +% *move* is the optional moving argument *caption* (the thing +% that goes to the list of tables/figures) +% *caption* is the text of the caption +\makeatletter +\newcommand\captionof[1]{\def\@captype{#1}\caption} +\makeatother +% +%%ACE \captionof{table}{Hello world from line 210} +% To circumvent mis-numbering of interleaved float and non-float table +% and figure captions, it is necessary to include the `perpage` package and +% "make them sorted" (FFI see https://texfaq.org/FAQ-figurehere) +% I (ACE) don't know how to get this package to include: +% \usepackage{bigfoot} +% so I included the source instead: +\makeatletter +\input{perpage.tex} +\makeatother +% +% Ensure that table numbers are sorted +\MakeSorted{table} +% Ensure that figure numbers are sorted +\MakeSorted{figure} + +% ----------------------------------------------------------------------------- + +% ----------------------------------------------------------------------------- +% preamble includes END +% -----------------------------------------------------------------------------
--- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_anova_script.Rmd Fri Oct 28 18:27:21 2022 +0000 @@ -7,81 +7,153 @@ date: - "May 28, 2018" - "; revised June 23, 2022" +lot: true output: pdf_document: toc: true - toc_depth: 3 + toc_depth: 2 keep_tex: true -header-includes: - - \usepackage{longtable} - - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut - - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut + dev: pdf + includes: + in_header: mqppep_anova_preamble.tex +latex_macros: false +raw_tex: true +urlcolor: blue params: alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" preprocDb: "test-data/test_input_for_anova.sqlite" kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] - show_toc: true - firstDataColumn: "^Intensity[^_]" - imputationMethod: !r c("group-median", "median", "mean", "random")[1] - meanPercentile: 1 - sdPercentile: 1.0 regexSampleNames: "\\.\\d+[A-Z]$" regexSampleGrouping: "\\d+" + groupFilterPatterns: ".+" + groupFilter: !r c("none", "exclude", "include")[1] + imputationMethod: !r c("group-median", "median", "mean", "random")[4] + kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5] + #imputationMethod: !r c("group-median", "median", "mean", "random")[1] + + # how should sample groups be interpreted? + # - "f": fixed patterns (like `grep -F`) + # - "p": PERL-compatible (like `grep -P`) + # - "r": extended grep patterns (like `grep -E`) + # use what case sensitivity? + # - "i": case insensitive matching (like `grep -i`) + groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1] + # what pattern should be used for the first column + # (extended grep pattern, case sensitive) + firstDataColumn: "^Intensity[^_]" + # for small random value imputation, what percentile should be center? + meanPercentile: 50 + #meanPercentile: 1 + # for small random value imputation, what should `s / mean(x)` ratio be? + sdPercentile: 1.0 + # output path for imputed data file imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" + # output path for imputed/quantile-normalized/log-transformed data file imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" + # output path for contents of `stats_metadata_v` table anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" + # how to test one variable with > 2 categories (e.g., aov or kruskal.test) oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] + # how to test one variable with 2 categories (e.g., oneway.test) oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] - kseaCutoffStatistic: !r c("p.value", "FDR")[2] - kseaCutoffThreshold: !r c( 0.1, 0.05)[2] - kseaMinKinaseCount: 1 - intensityHeatmapRows: 75 + # what should be the minimum quality for consideration in both + minQuality: 0 + # correct KSEA with FDR (recommended) or raw p-value + kseaCutoffStatistic: !r c("FDR", "p.value")[1] + # correct KSEA threshold 0.05 (conventional) or higher (perhaps better) + # "perhaps better" meaning that KSEA is an hypothesis-generator, not -test + #kseaCutoffThreshold: !r c(0.05, 0.1, 0.25, 0.5)[1] + # minimum number of substrates required for a kinase to be considered in KSEA + kseaMinSubstrateCount: 1 + # Should KSEA be performed aggregating signed log2FC or absolute? + # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores + # TRUE use abs(log2FC) for KSEA as Justin Drake requested; this is a + # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. + kseaUseAbsoluteLog2FC: TRUE + #kseaUseAbsoluteLog2FC: FALSE + # minimum number of observed values per sample-group + intensityMinValuesPerGroup: 1 + # maximum number of heatmap rows (result are poor when > 50) + intensityHeatmapRows: 50 + # what should be the primary criterion to eliminate excessive heatmap rows + intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1] + # should correlation among substrates be used (rather than covariance) + correlateSubstrates: TRUE + # only show covariance among variables having variance > 1 + filterCovVarGT1: TRUE + # maximum number of residues to display for ppeps in rownames or columnames + ppepTruncN: 10 + # maximum number of characters of subgenes to display in rownames or columnames + subgeneTruncN: 10 + # maximum number of characters for paste(subgene, ppep) for enrichment plots + substTruncN: 20 + # should boxplots use variable-width boxes to reflect # of samples + boxPlotVarWidth: TRUE + # should boxplots use notched boxes to reflect difference between samples + boxPlotNotch: TRUE + # look-up tables for kinase descriptions + kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2" + kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2" + # should debugging trace messages be printed? + showEnrichedSubstrates: FALSE + + # should debugging nb/nbe messages be printed? + printNBMsgs: FALSE + # should debugging trace messages be printed? + printTraceMsgs: FALSE + # when debugging files are needed, set debugFileBasePath to the path + # to the directory where they should be writtn + debugFileBasePath: !r if (TRUE) NULL else "test-data" --- -<!-- - kseaCutoffStatistic: !r c("p.value", "FDR")[2] - kseaCutoffThreshold: !r c(0.05, 0.1)[1] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/test_input_for_anova.tabular" - preprocDb: "test-data/test_input_for_anova.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" - preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" - preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] - - alphaFile: "test-data/alpha_levels.tabular" - inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" - preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" - kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] - - inputFile: "test-data/density_failure.preproc_tab.tabular" - kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] - latex_document: default ---> -```{r setup, include = FALSE} + +```{r setup, include = FALSE, results = 'asis'} + +# simple debug messaging +print_nb_messages <- params$printNBMsgs + +nb <- if (!print_nb_messages) { + function(...) invisible() + } else { + function(..., f = cat) f("\n$\\exists{}\\supset\\forall{}$", ...) + } + +nbe <- if (!print_nb_messages) { + function(...) invisible() + } else { + function(..., f = cat, file = stderr()) { + cat( + stringi::stri_unescape_unicode("\nNBE \\u2203\\u2283\\u2200"), + ..., + file = file + ) + } + } + #ref for debugging: https://yihui.org/tinytex/r/#debugging options(tinytex.verbose = TRUE) # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 -knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) +knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10), dpi = 300) # freeze the random number generator so the same results will be produced # from run to run set.seed(28571) ### LIBRARIES + +if (print_nb_messages) nbe("library(gplots)") library(gplots) +if (print_nb_messages) nbe("library(caret)") +# load caret for nearZeroVar +if (print_nb_messages) nbe("Please ignore the messages about systemd, if any.\n") +library(caret) +if (print_nb_messages) nbe("library(DBI)") library(DBI) +if (print_nb_messages) nbe("library(RSQLite)") library(RSQLite) +if (print_nb_messages) nbe("library(sqldf)\n") # Suppress "Warning: no DISPLAY variable so Tk is not available" suppressWarnings(suppressMessages(library(sqldf))) @@ -97,30 +169,42 @@ ### CONSTANTS -const_parfin <- par("fin") -const_boxplot_fill <- "grey94" -const_stripchart_cex <- 0.5 -const_stripsmall_cex <- - sqrt(const_stripchart_cex * const_stripchart_cex / 2) -const_stripchart_jitter <- 0.3 -const_write_debug_files <- FALSE -const_table_anchor_bp <- "bp" -const_table_anchor_ht <- "ht" -const_table_anchor_p <- "p" -const_table_anchor_tbp <- "tbp" - - +const_boxplot_fill <- "grey94" const_ksea_astrsk_kinases <- 1 const_ksea_nonastrsk_kinases <- 2 const_ksea_all_kinases <- 3 - -const_log10_e <- log10(exp(1)) - -### FUNCTIONS - -# from `demo(error.catching)` +const_log10_e <- log10(exp(1)) +const_stripchart_cex <- 0.5 +const_stripchart_jitter <- 0.3 +const_table_anchor_bp <- "bp" +const_table_anchor_ht <- "ht" +const_table_anchor_p <- "p" +const_table_anchor_t <- "t" +const_table_anchor_tbp <- "tbp" + + +### GLOBAL VARIABLES (params) + +## functions to process params + +is_string_null_or_empty <- function(x) { + # N. B. non-strings are intentionally treated as NULL + if (is.null(x)) + TRUE + else if (!is.character(x)) + TRUE + else x == "" +} + ##' Catch *and* save both errors and warnings, and in the case of ##' a warning, also keep the computed result. +##' return result as list(value = ..., warning = ...) +##' - value will be: +##' - the result if no exception is thrown +##' - the exception if an exception is caught +##' - warning will be a string except perhaps when warning argument is not NULL +##' +##' adapted from `demo(error.catching)` ##' ##' @title tryCatch both warnings (with value) and errors ##' @param expr an \R expression to evaluate @@ -128,32 +212,428 @@ ##' 'value' may be an error caught. ##' @author Martin Maechler; ##' Copyright (C) 2010-2012 The R Core Team -try_catch_w_e <- function(expr) { - wrn <- NULL - # warning handler - w_handler <- function(w) { - wrn <<- w - invokeRestart("muffleWarning") +try_catch_w_e <- + function(expr, error = function(e) e, warning = NULL) { + wrn <- NULL + # warning handler + w_handler <- + if (is.function(warning)) + warning + else + function(w) { + wrn <<- w + invokeRestart("muffleWarning") + } + e_handler <- + if (is.function(error)) + error + else + function(e) e + # return result as list(value = ..., warning = ...) + # - value will be: + # - the result if no exception is thrown + # - the exception if an exception is caught + list( + value = withCallingHandlers( + tryCatch( + expr, + error = e_handler + ), + warning = w_handler + ), + warning = wrn + ) + } + +see_kvp <- + function(format, key, value, suffix = "") { + if ( + !all( + is.character(format), + is.character(key), + is.character(value), + is.character(suffix) + ) + ) { + cat("all arguments to see_kvp should be character") + knitr::knit_exit() + } + result <- sprintf(format, value) + if (length(result) > 1) { + sprintf( + "%s = c(%s)%s", + whack_underscores(key), + paste(result, collapse = ", "), + suffix + ) + } else { + sprintf( + "%s = %s%s", + key, + result, + suffix + ) + } + } + +see_logical <- + function(x, suffix = "", xprssn = deparse1(substitute(x))) { + result <- as.character(as.logical(x)) + # handle NAs and NaNs + result[is.na(result)] <- "NA" + see_kvp( + format = "%s", + key = xprssn, + value = result, + suffix = suffix + ) + } + +see_numeric <- + function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { + if (is.numeric(digits) && is.numeric(x)) { + digits <- as.integer(digits) + digits <- min(16, max(0, digits)) + format <- paste0("%0.", as.character(digits), "g") + result <- sprintf(format, x) + see_kvp( + format = "%s", + key = xprssn, + value = result, + suffix = suffix + ) + } + } + +see_character <- + function(x, suffix = "", xprssn = deparse1(substitute(x))) { + if (is.character(x)) { + see_kvp( + format = "%s", + key = xprssn, + value = sprintf("\"%s\"", x), + suffix = suffix + ) + } + } + +see_variable <- + function(x, suffix = "", digits = 3, xprssn = deparse1(substitute(x))) { + if (is.character(x)) { + see_character(x, suffix, xprssn) + } else if (is.numeric(x)) { + see_numeric(x, suffix, digits, xprssn) + } else if (is.logical(x)) { + see_logical(x, suffix, xprssn) + } else { + f <- file("") + sink(f) + str(x) + msg <- paste(readLines(f), collapse = "\n") + sink() + close(f) + paste0( + "see_variable - str(", + xprssn, + "):\n", + msg, "\n" + ) + } + } + +# ref: https://tug.org/texinfohtml/latex2e.html +# LaTeX sets aside the following characters for special purposes. +# For example, the percent sign % is for comments. +# They are called reserved characters or special characters. +# They are all discussed elsewhere in this manual. +# +# $ % & { } _ ~ ^ \ # +# +# If you want a reserved character to be printed as itself, in the text body +# font, for all but the final three characters in that list simply put +# a backslash \ in front of the character. +# Thus, typing \$1.23 will produce $1.23 in your output. +# +# As to the last three characters, to get a tilde in the text body font, +# use \~{} (omitting the curly braces would result in the next character +# receiving a tilde accent). +# Similarly, to get a text body font circumflex use \^{}. +# To get a backslash in the font of the text body enter \textbackslash{}. +whack_math <- + function(v) { + v <- as.character(v) + w <- gsub("\\", "\\textbackslash ", v, fixed = TRUE) + w <- Reduce( + f = function(l, r) { + gsub(r, paste0("\\", r), l, fixed = TRUE) + }, + x = c("#", "$", "%", "&", "{", "}", "_"), + init = w + ) + w <- gsub("^", "\\^{}", w, fixed = TRUE) + return(w) + if (all(v == w)) + v + else + paste0("\\texttt{", w, "}") } - list( - value = withCallingHandlers( - tryCatch( - expr, - error = function(e) e - ), - warning = w_handler - ), - warning = wrn +whack_underscores <- whack_math + +## dump params to stderr (remove this eventually) + +if (FALSE) nbe(see_variable(params)) + +## unlist params for eventual output + +param_unlist <- unlist(as.list(params)) + +# no need to whack underscores and dollars because this is verbatim +param_df <- data.frame( + parameter = paste0("\\verb@", names(param_unlist), "@"), + value = paste0( + "\n\\begin{tiny}\n\\verb@", + param_unlist, + "@\n\\end{tiny}" + ) + ) +param_df <- data.frame( + parameter = names(param_unlist), + value = param_unlist + ) +param_df <- param_df[order(param_df$parameter), ] + +## general output control + +debug_file_base_path <- params$debugFileBasePath +print_trace_messages <- params$printTraceMsgs +show_enriched_substrates <- params$showEnrichedSubstrates +boxplot_varwidth <- params$boxPlotVarWidth +boxplot_notch <- params$boxPlotNotch + +## parameters for static data + +kinase_name_uprt_lut_bz2 <- params$kinaseNameUprtLutBz2 +kinase_uprt_desc_lut_bz2 <- params$kinaseUprtDescLutBz2 + +## parameters for input file + +preproc_db <- params$preprocDb +alpha_file <- params$alphaFile +input_file <- params$inputFile + +# First data column - ideally, this could be detected via +# regexSampleNames, but for now leave it as is. +first_data_column <- params$firstDataColumn +fdc_is_integer <- is.integer(first_data_column) +if (fdc_is_integer) { + first_data_column <- as.integer(params$firstDataColumn) +} + +## parameters for output files + +ksea_app_prep_db <- params$kseaAppPrepDb +imputed_data_filename <- params$imputedDataFilename +imp_qn_lt_data_filenm <- params$imputedQNLTDataFile +anova_ksea_mtdt_file <- params$anovaKseaMetadata + +## parameters for imputation + +# Imputation method, should be one of +# "random", "group-median", "median", or "mean" +imputation_method <- params$imputationMethod + +# Selection of percentile of logvalue data to set the mean for random number +# generation when using random imputation +mean_percentile <- params$meanPercentile / 100.0 + +# deviation adjustment-factor for random values; real number. +sd_percentile <- params$sdPercentile + +## parameters for group parsing and filtering + +# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" +regex_sample_names <- params$regexSampleNames +# Regular expression to extract Sample Grouping from Sample Name; +# if error occurs, compare smpl_trt vs. sample_name_matches +# to see if groupings/pairs line up +# e.g., "(\\d+)" + +regex_sample_grouping <- params$regexSampleGrouping +# What are the patterns for filtering sample groups? +# How should sample groups be filtered? +# - none: do not filter +# - include: include sample groups matching filter +# - exclude: include sample groups not matching filter + +sample_group_filter <- params$groupFilter +if (grepl("f", params$groupFilterMode, fixed = TRUE)) { + sample_group_filter_perl <- FALSE + sample_group_filter_fixed <- TRUE +} else if (grepl("p", params$groupFilterMode, fixed = TRUE)) { + sample_group_filter_perl <- TRUE + sample_group_filter_fixed <- FALSE +} else { # normal regex + sample_group_filter_perl <- FALSE + sample_group_filter_fixed <- FALSE +} + +sample_group_filter_nocase <- + grepl("i", params$groupFilterMode, fixed = TRUE) + +# What PCRE patterns should be included or excluded +group_filter_patterns_csv <- params$groupFilterPatterns +sample_group_filter_patterns <- strsplit( + x = group_filter_patterns_csv, + split = ",", + fixed = TRUE + )[[1]] + +## parameters for hypothesis testing + +one_way_all_categories_fname <- params$oneWayManyCategories + +one_way_all_categories <- try_catch_w_e( + match.fun(one_way_all_categories_fname)) + +if (!is.function(one_way_all_categories$value)) { + write("fatal error for parameter oneWayManyCategories:", stderr()) + write(one_way_all_categories$value$message, stderr()) + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + knitr::knit_exit() + quit(save = "no", status = 1) + } +} + +one_way_all_categories <- one_way_all_categories$value + +one_way_two_categories_fname <- params$oneWayManyCategories +one_way_two_categories <- try_catch_w_e( + match.fun(one_way_two_categories_fname)) +if (!is.function(one_way_two_categories$value)) { + cat("fatal error for parameter oneWayTwoCategories: \n") + cat(one_way_two_categories$value$message, fill = TRUE) + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + knitr::knit_exit() + quit(save = "no", status = 1) + } +} +one_way_two_categories <- one_way_two_categories$value + +## parameters for KSEA + +ksea_cutoff_statistic <- params$kseaCutoffStatistic +ksea_cutoff_threshold <- params$kseaCutoffThreshold +ksea_min_substrate_count <- params$kseaMinSubstrateCount + +## parameters for global variables consumed by functions + +# intensityHeatmapCriteria: !r c("na_count", "p_value")[2] # TODO switch to 1 +# TODO Validate within list +g_intensity_hm_criteria <- params$intensityHeatmapCriteria +if (is_string_null_or_empty(g_intensity_hm_criteria)) { + cat("invalid intensityHeatmapCriteria parameter (must be string)") + knitr::knit_exit() +} +switch( + g_intensity_hm_criteria, + "quality" = NULL, + "na_count" = NULL, + "p_value" = NULL, + { + with( + params, + cat( + sprintf( + "invalid %s (must be %s)", + see_variable(intensityHeatmapCriteria), + "one of quality or na_count or p_value" + ) + ) + ) + knitr::knit_exit() + } +) + +# intensityHeatmapRows: 50 +# TODO Validate >> 0 < 75 +g_intensity_hm_rows <- params$intensityHeatmapRows +if (!is.integer(g_intensity_hm_rows) || g_intensity_hm_rows < 1) { + cat("invalid intensityHeatmapRows (must be integer > 0)") + knitr::knit_exit() +} + +g_intensity_min_per_class <- params$intensityMinValuesPerGroup +if (!is.integer(g_intensity_min_per_class) || g_intensity_min_per_class < 0) { + cat("invalid intensityMinValuesPerGroup (must be integer > -1") + knitr::knit_exit() +} + +if (is.na(as.logical(g_correlate_substrates <- params$correlateSubstrates))) { + cat("invalid correlateSubstrates (must be TRUE or FALSE)") + knitr::knit_exit() +} + +if (is.na(as.logical(g_filter_cov_var_gt_1 <- params$filterCovVarGT1))) { + cat("invalid filterCovVarGT1 parameter (must be TRUE or FALSE)") + knitr::knit_exit() +} + +# TODO Validate >> 0 < 30 +g_ppep_trunc_n <- params$ppepTruncN + +# TODO Validate >> 0 < 30 +g_subgene_trunc_n <- params$subgeneTruncN + +# TODO Validate >> 0 < 30 +g_sbstr_trunc_n <- params$substTruncN + + +### OPERATORS + +# Test for exclusion +# ref: https://www.reneshbedre.com/blog/in-operator-r.html +`%notin%` <- Negate(`%in%`) + +# Augmented assignment +# ref: https://www2.cs.arizona.edu/icon/refernce/infix2.htm#aug_assign +`%||:=%` <- function(lvalue, ...) { + pf <- parent.frame() + rvalue <- Reduce(paste0, x = ..., init = lvalue) + assign( + x = as.character(substitute(lvalue)), + value = rvalue, + pos = pf + ) + invisible(rvalue) +} + +### FUNCTIONS + +no_op <- + function() { + } +# this function is not used in this file and should be removed while +# factoring out reusable code +all_apply <- function(f, v, na_rm = TRUE, ...) { + Reduce( + f = function(l, r) if (na_rm && is.na(r)) TRUE else l && r, + x = sapply(X = v, FUN = f, ...), + init = TRUE ) } - -write_debug_file <- function(s) { - if (const_write_debug_files) { - s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) - print(sprintf("DEBUG writing file %s", spath)) +write_debug_file <- function(data_frame) { + if (!is.null(debug_file_base_path)) { + s_path <- + sprintf( + "%s/%s.txt", + debug_file_base_path, + deparse(substitute(data_frame)) + ) write.table( - s, + data_frame, file = s_path, sep = "\t", col.names = TRUE, @@ -174,6 +654,137 @@ new.env(parent = emptyenv()) } +# make apply readable for rows +row_apply <- function(x, fun, ..., simplify = TRUE) { + apply(x, MARGIN = 1, fun, ..., simplify = TRUE) +} + +# make apply readable for columns +column_apply <- function(x, fun, ..., simplify = TRUE) { + apply(x, MARGIN = 2, fun, ..., simplify = TRUE) +} + +##' Produce a vector of boolean values whose i-th value is TRUE when any +##' member of v matches the i-th membr of s, where i in 1:seq_len(length(s)) +##' +##' @title Search multiple strings for matches of multiple substrings +##' @param v a vector of substrings to match +##' @param s a vector of strings to search for matches +##' @param ... additional arguments to grepl +##' @return a list with keys in s and valuse that are vectors of elements of v +##' @author Art Eschenlauer +##' Copyright (C) 2022 Art Eschenlauer; +##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms +mgrepl <- function(v, s, ...) { + grpl_rslt <- rep_len(0, length(s)) + for (vi in v) { + grpl_rslt_v <- sapply( + X = s, + FUN = function(t) { + Reduce( + f = function(l, r) if (is.null(l)) r else c(l, r), + x = sapply( + X = vi, + FUN = function(f) grepl(f, t, ...) + ), + init = c() + ) + }, + simplify = "array" + ) + grpl_rslt <- grpl_rslt + grpl_rslt_v + } + rslt <- unname(grpl_rslt > 0) +} + +##' Produce positions in a vector where succeeding value != current valus +##' +##' @title Search vector for neighboring positions having different values +##' @param v a vector of comparable numeric values (e.g. integers) +##' @return a vector of positions i where v[i] != v[i + 1] +##' @author Art Eschenlauer +##' Copyright (C) 2022 Art Eschenlauer; +##' MIT License; https://en.wikipedia.org/wiki/MIT_License#License_terms +transition_positions <- function(v) { + Reduce( + f = function(l, i) if ((i != 1) && (v[i - 1] != v[i])) c(l, i - 1) else l, + x = seq_along(v)[-1:0], + init = c() + ) +} + +### figure debug functions + +cat_par_vector <- function(par_name, lbl = "", newlines = TRUE) { + cat( + sprintf( + "%spar(%s) = c(%s)%s", + lbl, + par_name, + paste(par(par_name), collapse = ", "), + if (newlines) "\n\n" else "" + ) + ) +} + +cat_margins <- function(lbl = NULL) { + for (p in c("fig", "fin", "mar", "mai", "omd", "omi", "oma")) + cat_par_vector(p, if (!is.null(lbl)) paste0(lbl, " ") else NULL) +} + +cat_variable <- + function(x, suffix = "", digits = 3, force_str = FALSE) { + xprssn <- deparse1(substitute(x)) + if (force_str || is.matrix(x) || is.list(x) || is.data.frame(x)) { + cat( + paste0( + "\n\\texttt{\\textbf{", + whack_underscores(xprssn), + "}} [", + typeof(x), + ",", + mode(x), + "] =\n" + ) + ) + cat("\n\\begin{verbatim}\n") + str(x) + cat("\n\\end{verbatim}\n") + } else { + cat("\n", see_variable(x, suffix, digits, xprssn)) + } + } + +### structure helper functions + +# ref: staque.R - Icon-oriented stack and queue operations +# - https://gist.github.com/eschen42/917690355e53918b9e7ba7138a02d1f8 +# +# sq_get(v):x produces the leftmost element of v and removes it from v, +# but produces NA if v is empty +sq_get <- function(v) { + if (length(v) == 0) return(NA) + assign(as.character(substitute(v)), v[-1], parent.frame()) + return(v[1]) +} +# +# sq_put(v,x1,...,xn):v puts x1, x2, ..., xn onto the right end of v, +# producing v. +# Values are pushed in order from left to right, +# so xn becomes the last (rightmost) value on v. +# sq_put(v) with no second argument does nothing. +sq_put <- function(v, x = NA, ...) { + pf <- parent.frame() + if (is.null(x)) return(pf$v) + if ( + !(length(x) > 1) && + !rlang::is_closure(x) && + is.na(x) + ) return(pf$v) + assign(as.character(substitute(v)), c(v, x, ...), pf) + pf[[as.character(substitute(v))]] +} + ### numerical/statistical helper functions any_nan <- function(x) { @@ -186,6 +797,7 @@ sd(x[ok]) } +# compute anova raw p-value anova_func <- function(x, grouping_factor, one_way_f) { subject <- data.frame( intensity = x @@ -203,12 +815,421 @@ pvalue } +# This code adapted from matrixcalc::is.positive.definite +# Notably, this simply tests without calling stop() +is_positive_definite <- function(x, tol = 1e-08) { + if (!is.matrix(x)) + return(FALSE) + if (!is.numeric(x)) + return(FALSE) + if (nrow(x) < 1) + return(FALSE) + if (ncol(x) < 1) + return(FALSE) + if (nrow(x) != ncol(x)) + return(FALSE) + sum_symm <- sum(x == t(x), na.rm = TRUE) + value_count <- Reduce("*", dim(x)) + if (sum_symm != value_count) + return(FALSE) + eigenvalues <- eigen(x, only.values = TRUE)$values + n <- nrow(x) + for (i in 1:n) { + if (abs(eigenvalues[i]) < tol) { + eigenvalues[i] <- 0 + } + } + if (any(eigenvalues <= 0)) { + return(FALSE) + } + return(TRUE) +} ### LaTeX functions -latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { - v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v - cat( +# Use this like print.data.frame, from which it is adapted: +data_frame_table_latex <- + function( + x, + # digits to pass to format.data.frame + digits = NULL, + # TRUE -> right-justify columns; FALSE -> left-justify + right = TRUE, + # maximumn number of rows to print + max = NULL, + # string with justification of each column + justification = NULL, + # TRUE to center on page + centered = TRUE, + # optional caption + caption = NULL, + # h(inline); b(bottom); t (top) or p (separate page) + anchor = "h", + # set underscore_whack to TRUE to escape underscores + underscore_whack = TRUE, + # how to emit results + emit = cat + ) { + if (is.null(justification)) + justification <- + Reduce( + f = paste, + x = rep_len(if (right) "r" else "l", length(colnames(x))) + ) + n <- length(rownames(x)) + if (length(x) == 0L) { + emit( + sprintf( + # if n is one, use singular 'row', else use plural 'rows' + ngettext( + n, + "data frame with 0 columns and %d row", + "data frame with 0 columns and %d rows" + ), + n + ), + "\n", + sep = "" + ) + } else if (n == 0L) { + emit("0 rows for:\n") + latex_itemized_list( + v = names(x), + underscore_whack = underscore_whack + ) + } else { + if (is.null(max)) + max <- getOption("max.print", 99999L) + if (!is.finite(max)) { + cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) + knitr::knit_exit() + } + omit <- (n0 <- max %/% length(x)) < n + m <- as.matrix( + format.data.frame( + if (omit) x[seq_len(n0), , drop = FALSE] else x, + digits = digits, + na.encode = FALSE + ) + ) + emit( + # h(inline); b(bottom); t (top) or p (separate page) + paste0("\\begin{table}[", anchor, "]"), + "\\leavevmode", + sep = "\n" + ) + if (!is.null(caption)) + emit(paste0(" \\caption{", caption, "}")) + if (centered) emit("\\centering\n") + emit( + paste( + " \\begin{tabular}{", + justification, + "}\n", + sep = "" + ) + ) + + # ref for top and bottom struts (\T and \B): + # https://tex.stackexchange.com/a/50355 + if (!is.null(caption)) + emit("\\B \\\\\n") + latex_table_row( + v = colnames(m), + extra = " \\T \\B", + underscore_whack = underscore_whack + ) + emit("\\hline \\\\\n") + for (i in seq_len(length(m[, 1]))) { + latex_table_row( + v = m[i, ], + underscore_whack = underscore_whack + ) + } + emit( + paste( + " \\end{tabular}", + "\\end{table}", + sep = "\n" + ) + ) + if (omit) + emit(" [ reached 'max' / getOption(\"max.print\") -- omitted", + n - n0, "rows ]\n") + } + invisible(x) + } + +# Use this like print.data.frame, from which it is adapted: +data_frame_tabbing_latex <- + function( + x, + # vector of tab stops, in inches + tabstops, + # vector of headings, registered with tab-stops + headings = colnames(x), + # digits to pass to format.data.frame + digits = NULL, + # maximumn number of rows to print + max = NULL, + # optional caption + caption = NULL, + # set underscore_whack to TRUE to escape underscores + underscore_whack = TRUE, + # flag for landscape mode + landscape = FALSE, + # flag indicating that subsubsection should be used for caption + # rather than subsection + use_subsubsection_header = TRUE, + # character-size indicator; for possible values, see: + # https://tug.org/texinfohtml/latex2e.html#Font-sizes + charactersize = "small", + # set verbatim to TRUE to debug output + verbatim = FALSE + ) { + + hlinport <- if (landscape) { + function() cat("\\hlinlscp \\\\\n") + } else { + function() cat("\\hlinport \\\\\n") + } + + tabstops_tex <- + Reduce( + f = function(l, r) paste0(l, r), + x = sprintf("\\hspace{%0.2fin}\\=", tabstops), + init = "" + ) + + n <- length(rownames(x)) + if (length(x) == 0L) { + cat( + sprintf( + # if n is one, use singular 'row', else use plural 'rows' + ngettext( + n, + "data frame with 0 columns and %d row", + "data frame with 0 columns and %d rows" + ), + n + ), + "\n", + sep = "" + ) + } else if (n == 0L) { + cat("0 rows for:\n") + latex_itemized_list( + v = names(x), + underscore_whack = underscore_whack + ) + } else { + if (is.null(max)) + max <- getOption("max.print", 99999L) + if (!is.finite(max)) { + cat("Abend because: invalid 'max' / getOption(\"max.print\"): ", max) + knitr::knit_exit() + } + omit <- (n0 <- max %/% length(x)) < n + m <- as.matrix( + format.data.frame( + if (omit) x[seq_len(n0), , drop = FALSE] else x, + digits = digits, + na.encode = FALSE + ) + ) + if (landscape) + cat("\n\\begin{landscape}") + tex_caption <- + if (!is.null(caption)) sprintf("\\captionof{table}{%s}\n", caption) + else "\n" + # build the column names, which have multiple lines when + # length(headings) is a multiple of the number of columns + column_names <- "" + while (length(headings) > 0) { + my_row <- c() + for (i in 1:(1 + length(tabstops))) { + my_field <- sq_get(headings) + sq_put(my_row, if (is.na(my_field)) "" else my_field) + } + column_names %||:=% latex_tabbing_row( + v = my_row, + underscore_whack = underscore_whack, + action = paste0 + ) + } + + # Begin tabbing environment after beginning charactersize environment + if (verbatim) cat("\n\\begin{verbatim}") + cat( + paste0( + "\n\\begin{", charactersize, "}", tex_caption, + "\\begin{tabwrap}{", tabstops_tex, "}\n" + ) + ) + # emit column names + cat(column_names) + # emit hline + hlinport() + for (i in seq_len(length(m[, 1]))) { + my_row <- latex_tabbing_row( + v = m[i, ], + underscore_whack = underscore_whack, + action = paste0 + ) + if (FALSE) + cat(my_row) + else + cat(my_row) + } + hlinport() + if (omit) + cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", + n - n0, "rows ]\n") + # End charactersize environment after ending tabbing environment + cat(paste0("\\end{tabwrap}\n\\end{", charactersize, "}\n")) + if (verbatim) cat("\\end{verbatim}\n") + if (landscape) + cat("\\end{landscape}\n") + } + invisible(x) + } + +param_df_noexit <- + function(e = NULL) { + data_frame_tabbing_latex( + x = param_df, + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters", + verbatim = FALSE + ) + if (!is.null(e)) { + sink(stderr()) + cat("Caught fatal error:\n\n") + str(e) + sink() + } + } + +param_df_exit <- + function(e = NULL) { + param_df_noexit(e) + knitr::knit_exit() + exit(-1) + } + +# exit with exit code (default 0) and optional msg +exit <- + function(code = 0, msg = NULL, use_stderr = FALSE) { + if (!is.null(msg)) { + if (use_stderr) sink(stderr()) + cat("\n\n", msg, "\n\n") + if (use_stderr) sink() + } + q(save = "no", status = code) + } + +# make control sequences into printable latex sequences +latex_printable_control_seqs <- + function(s) { + s <- gsub("[\\]", "xyzzy_plugh", s) + s <- gsub("[$]", "\\\\$", s) + s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) + return(s) + } +nolatex_verbatim <- + function(expr) eval(expr) + +latex_verbatim <- + function(expr) { + arg_string <- deparse1(substitute(expr)) + cat("\n\\begin{verbatim}\n___\n") + tryCatch( + expr = expr, + error = param_df_exit, + #ACE error = + #ACE function(e) { + #ACE cat("Caught error:\n\n") + #ACE str(e) + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("...\n\\end{verbatim}\n") + ) + } + +latex_samepage <- + function(expr) { + arg_string <- deparse1(substitute(expr)) + cat("\n\\begin{samepage}\n") + tryCatch( + expr = expr, + error = param_df_exit, + #ACE error = + #ACE function(e) { + #ACE cat("Caught error:\n\n") + #ACE str(e) + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("\n\\end{samepage}\n") + ) + } + +# return the result of invocation after showing parameters +# ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ +latex_show_invocation <- + function(f, f_name = deparse1(substitute(f)), head_patch = FALSE) { + function(...) { + my_env <- (as.list(environment())) + va <- list(...) + my_rslt <- new_env() + my_rslt$rslt <- NULL + latex_verbatim( + expr = { + cat(sprintf("\n .. Local variables for '%s':\n\n", f_name)) + str(va) + if (!head_patch) { + # return this result + # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ + cat(sprintf("\n .. Invoking '%s'\n", f_name)) + tryCatch( + { + cat("\n\\end{verbatim}\n") + rslt <- do.call(f, va) + }, + error = param_df_exit, + #ACE error = function(e) { + #ACE cat("\n\\begin{verbatim}\n") + #ACE str(e) + #ACE cat("\n\\end{verbatim}\n") + #ACE knitr::knit_exit() + #ACE stop(e) + #ACE }, + finally = cat("\n\\begin{verbatim}\n") + ) + cat(sprintf("\n .. '%s' returned:\n", f_name)) + str(rslt) + my_rslt$rslt <- rslt + } + } + ) + # return the result of invocation with the shown parameters + # ref: https://www.r-bloggers.com/2013/08/a-new-r-trick-for-me-at-least/ + if (head_patch) my_rslt$rslt <- do.call(f, va) + (my_rslt$rslt) + } + } + +latex_collapsed_vector <- function( + collapse_string, + v, + underscore_whack = TRUE, + action = cat0 + ) { + v_sub <- + if (underscore_whack) whack_underscores(v) else v + action( paste0( v_sub, collapse = collapse_string @@ -242,113 +1263,36 @@ cat(" \\\\\n") } -# Use this like print.data.frame, from which it is adapted: -data_frame_latex <- - function( - x, - ..., - # digits to pass to format.data.frame - digits = NULL, - # TRUE -> right-justify columns; FALSE -> left-justify - right = TRUE, - # maximumn number of rows to print - max = NULL, - # string with justification of each column - justification = NULL, - # TRUE to center on page - centered = TRUE, - # optional caption - caption = NULL, - # h(inline); b(bottom); t (top) or p (separate page) - anchor = "h", - # set underscore_whack to TRUE to escape underscores - underscore_whack = TRUE +latex_tabbing_row <- function( + v, + extra = "", + underscore_whack = TRUE, + action = cat0 ) { - if (is.null(justification)) - justification <- - Reduce( - f = paste, - x = rep_len(if (right) "r" else "l", length(colnames(x))) - ) - n <- length(rownames(x)) - if (length(x) == 0L) { - cat( - sprintf( - # if n is one, use singular 'row', else use plural 'rows' - ngettext( - n, - "data frame with 0 columns and %d row", - "data frame with 0 columns and %d rows" - ), - n - ), - "\n", - sep = "" - ) - } else if (n == 0L) { - cat("0 rows for:\n") - latex_itemized_list( - v = names(x), - underscore_whack = underscore_whack - ) - } else { - if (is.null(max)) - max <- getOption("max.print", 99999L) - if (!is.finite(max)) - stop("invalid 'max' / getOption(\"max.print\"): ", - max) - omit <- (n0 <- max %/% length(x)) < n - m <- as.matrix( - format.data.frame( - if (omit) x[seq_len(n0), , drop = FALSE] else x, - digits = digits, - na.encode = FALSE - ) - ) - cat( - # h(inline); b(bottom); t (top) or p (separate page) - paste0("\\begin{table}[", anchor, "]\n") - ) - if (!is.null(caption)) - cat(paste0(" \\caption{", caption, "}")) - if (centered) cat("\\centering\n") - cat( - paste( - " \\begin{tabular}{", - justification, - "}\n", - sep = "" - ) - ) - # ref: https://tex.stackexchange.com/a/50353 - # Describes use of \rule{0pt}{3ex} - if (!is.null(caption)) - cat("\\B \\\\ \\hline\\hline\n") - # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 - latex_table_row( - v = colnames(m), - extra = "\\T\\B", - underscore_whack = underscore_whack - ) - cat("\\hline\n") - for (i in seq_len(length(m[, 1]))) { - latex_table_row( - v = m[i, ], - underscore_whack = underscore_whack - ) - } - cat( - paste( - " \\end{tabular}", - "\\end{table}", - sep = "\n" - ) - ) - if (omit) - cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", - n - n0, "rows ]\n") - } - invisible(x) + # latex_collapsed_vector applies action to result of paste0; + # by default, action = cat; + # hence, a scalar string is assigned to v_collapsed + v_collapsed <- + latex_collapsed_vector( + "} \\> \\tabfill{", + v, + underscore_whack, + action = paste0 + ) + action( + "\\tabfill{", + v_collapsed, + "}", + extra, + " \\\\\n" + ) + } + +# N.B. use con = "" to emulate regular cat +fcat0 <- + function(..., sprtr = " ", cnnctn = file()) { + cat0(..., sep = sprtr, file = cnnctn) + invisible(cnnctn) } hypersub <- @@ -356,32 +1300,39 @@ hyper <- tolower(s) hyper <- gsub("[^a-z0-9]+", "-", hyper) hyper <- gsub("[-]+", "-", hyper) + hyper <- gsub("[_]+", "-", hyper) hyper <- sub("^[-]", "", hyper) hyper <- sub("[-]$", "", hyper) return(hyper) } -subsection_header <- - function(s) { - hyper <- hypersub(s) - cat( - sprintf( - "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", - hyper, s, hyper - ) - ) +table_href <- function(s = "offset", caption = "") { + paste0("\\hyperlink{table.\\arabic{", s, "}}{Table \\arabic{", s, "}}") + } + +table_offset <- function(i = 0, s = "offset", new = FALSE) { + paste0( + if (new) paste0("\\newcounter{", s, "}\n") else "", + "\\setcounter{", s, "}{\\value{table}}\n", + paste0(if (i > 0) rep(paste0("\\stepcounter{", s, "}"), i), "\n") + ) } -subsubsection_header <- - function(s) { +a_section_header <- + function(s, prefix = "") { hyper <- hypersub(s) - cat( - sprintf( - "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", - hyper, s, hyper - ) + my_subsection_header <- sprintf( + "\\hypertarget{%s}{\\%ssection{%s}\\label{%s}}\n", + hyper, + prefix, + gsub("_", "\\_", s, fixed = TRUE), + hyper ) + my_subsection_header } +section_header <- function(s) a_section_header(s, "") +subsection_header <- function(s) a_section_header(s, "sub") +subsubsection_header <- function(s) a_section_header(s, "subsub") ### SQLite functions @@ -419,10 +1370,54 @@ ### KSEA functions and helpers -# Adapted from KSEAapp::KSEA.Scores to allow retrieval of: -# - maximum log2(FC) +#' The KSEA App Analysis (KSEA Kinase Scores Only) +#' +#' Compute KSEA kinase scores and statistics from phoshoproteomics data input +#' Adapted from KSEAapp::KSEA.Scores to allow retrieval of maximum log2(FC) +#' +#' Result is an R data.frame with column names +#' "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" +#' "Please refer to the original Casado et al. publication for detailed +#' description of these columns and what they represent: +#' +#' - Kinase.Gene indicates the gene name for each kinase. +#' - mS represents the mean log2(fold change) of all the +#' kinase's substrates. +#' - Enrichment is the background-adjusted value of the kinase's mS. +#' - m is the total number of detected substrates +#' from the experimental dataset for each kinase. +#' - z.score is the normalized score for each kinase, weighted by +#' the number of identified substrates. +#' - p.value represents the statistical assessment for the z.score. +#' - FDR is the p-value adjusted for multiple hypothesis testing +#' using the Benjamini & Hochberg method." +#' +#' @param ksdata the Kinase-Substrate dataset uploaded from the file +#' prefaced with "PSP&NetworKIN_" +#' available from github.com/casecpb/KSEA/ +#' @param px the experimental data file formatted as described in +#' the KSEA.Complete() documentation +#' @param networkin a binary input of TRUE or FALSE, indicating whether +#' or not to include NetworKIN predictions, where +#' \code{NetworKIN = TRUE} +#' means include NetworKIN predictions +#' @param networkin_cutoff a numeric value between 1 and infinity setting +#' the minimum NetworKIN score +#' (this can be omitted if NetworKIN = FALSE) +#' +#' @return creates a new R data.frame with all the KSEA kinase +#' scores, along with each one's statistical +#' assessment, as described herein. +#' +#' @references +#' +#' Casado et al. (2013) Sci Signal. 6(268):rs6 +#' +#' Hornbeck et al. (2015) Nucleic Acids Res. 43:D512-20 +#' +#' Horn et al. (2014) Nature Methods 11(6):603-4 +#' ksea_scores <- function( - # For human data, typically, ksdata = KSEAapp::ksdata ksdata, @@ -444,9 +1439,13 @@ # A numeric value between 1 and infinity setting the minimum NetworKIN # score (can be left out if networkin = FALSE) - networkin_cutoff + networkin_cutoff, + + # Minimum substrate count, necessary to adjust the p-value appropriately. + minimum_substrate_count ) { + # no px$FC should be <= 0, but abs(px$FC) is used below as a precaution. if (length(grep(";", px$Residue.Both)) == 0) { # There are no Residue.Both entries having semicolons, so new is # simply px except two columns are renamed and a column is added @@ -492,18 +1491,53 @@ # Eliminate rows having missing values (e.g., non-imputed data) new <- new[complete.cases(new$log2_fc), ] } - if (networkin == TRUE) { - # When NetworKIN is true, filter on NetworKIN.cutoff which includes - # PhosphoSitePlus data *because its networkin_score is set to Inf* - ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] - ksdata_filtered <- ksdata_filtered[ - (ksdata_filtered$networkin_score >= networkin_cutoff), ] - } else { - # Otherwise, simply use PhosphSitePlus rows - ksdata_filtered <- ksdata[ - grep("PhosphoSitePlus", ksdata$Source), ] + # At this point, new$log2_fc is signed according to which contrast has + # the greater intensity + # To take the magnitude into account without taking the direction into + # account, set params$kseaUseAbsoluteLog2FC to TRUE + # + # Should KSEA be performed aggregating signed log2FC or absolute? + # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores + if (params$kseaUseAbsoluteLog2FC) { + # TRUE use abs(log2FC) for KSEA as Justin requested; this is a + # justifiable deviation from the KSEAapp::KSEA.Scores algorithm. + new$log2_fc <- abs(new$log2_fc) + } + + monitor_filtration_on_stderr <- TRUE + if (monitor_filtration_on_stderr) { + # set to TRUE to monitor filtration on stderr + sink(stderr()) + cat(see_variable(networkin, "\n")) } - # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD + ksdata_filtered <- + sqldf( + sprintf("%s %s", + "select * from ksdata where not Source = 'NetworKIN'", + if (networkin) + sprintf("or networkin_score >= %d", networkin_cutoff) + else + "" + ) + ) + if (monitor_filtration_on_stderr) { + cat(see_variable(sqldf( + "select count(*), Source from ksdata group by Source"), "\n")) + cat(see_variable(sqldf( + "select count(*), Source from ksdata_filtered group by Source"), "\n")) + sink() + } + + ############################################################################ + # Line numbers below refer to lines of: + # https://github.com/casecpb/KSEAapp/blob/master/R/KSEA.Scores.R + # I would put the original line in a comment but then lint would complain... + # - Indeed, I had to rename all the variables because lint didn't like names + # containing periods or capital letters. + # ACE + ############################################################################ + # + # (1) Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD # colnames of ksdata_filtered: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" @@ -516,6 +1550,8 @@ # INNER JOIN new b # ON a.SUB_GENE = b.SUB_GENE # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD + # (KSEA.Scores.R line # 105) + # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset <- base::merge(ksdata_filtered, new) # colnames of ksdata_dataset: # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" @@ -523,24 +1559,31 @@ # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) # Re-order dataset; prior to accounting for isoforms + # (KSEA.Scores.R line # 106) ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] # Extract non-isoform accession in UniProtKB + # (KSEA.Scores.R line # 107) ksdata_dataset$uniprot_no_isoform <- sapply( ksdata_dataset$KIN_ACC_ID, function(x) unlist(strsplit(as.character(x), split = "-"))[1] ) + # "last expression collapses isoforms ... for easy processing" # Discard previous results while selecting interesting columns ... + # (KSEA.Scores.R line # 110) ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] # Column names are now: # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" # "FC" "log2_fc" "Source" # Make column names human-readable + # (KSEA.Scores.R line # 111) colnames(ksdata_dataset_abbrev) <- c( "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", "FC", "log2FC", "Source" ) # SELECT * FROM ksdata_dataset_abbrev # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p + # (KSEA.Scores.R line # 112) + # "Extract KSData.filtered annotations that are only found in new" ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ order( @@ -549,6 +1592,7 @@ ksdata_dataset_abbrev$Substrate.Mod, ksdata_dataset_abbrev$p), ] + if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) # First aggregation step to account for multiply phosphorylated peptides # and differing peptide sequences; the goal here is to combine results # for all measurements of the same substrate. @@ -560,12 +1604,16 @@ # ORDER BY `Kinase.Gene`; # in two steps: # (1) compute average log_2(fold-change) + # "take the mean of the log2FC amongst phosphosite duplicates" + # (KSEA.Scores.R line # 115) ksdata_dataset_abbrev <- aggregate( log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, data = ksdata_dataset_abbrev, FUN = mean ) + if (print_nb_messages) nbe(see_variable(ksdata_dataset_abbrev)) # (2) order by Kinase.Gene + # (KSEA.Scores.R line # 117) ksdata_dataset_abbrev <- ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] # SELECT `Kinase.Gene`, count(*) @@ -573,9 +1621,14 @@ # GROUP BY `Kinase.Gene`; # in two steps: # (1) Extract the list of Kinase.Gene names + # "@@@@@@@@@@@@@@@@@@@@" + # "Do analysis for KSEA" + # "@@@@@@@@@@@@@@@@@@@@" + # (KSEA.Scores.R line # 124) kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, # named by Kinase.Gene + # (KSEA.Scores.R line # 125) kinase_list <- as.matrix(table(kinase_list)) # Second aggregation step to account for all substrates per kinase # CREATE TABLE mean_fc @@ -583,50 +1636,123 @@ # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC # FROM ksdata_dataset_abbrev # GROUP BY `Kinase.Gene` - mean_fc <- aggregate( - log2FC ~ Kinase.Gene, - data = ksdata_dataset_abbrev, - FUN = mean - ) - # mean_fc columns: "Kinase.Gene", "log2FC" - if (FALSE) { - # I need to re-think this; I was trying to find the most-represented - # peptide, but that horse has already left the barn - # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC - # FROM ksdata_dataset_abbrev - # GROUP BY `Kinase.Gene` - max_fc <- aggregate( + # (KSEA.Scores.R line # 127) + if (print_nb_messages) nb(see_variable(ksdata_dataset_abbrev), "\n") + mean_fc <- + aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, - FUN = function(r) max(abs(r)) + FUN = mean ) - } + if (print_nb_messages) nbe(see_variable(mean_fc), "\n") + + # for contrast j + # for each kinase i + # extract log2 of fold-change (from `new` above) + # (used in KSEA.Scores.R lines # 130 & 132) + log2_fc_j_each_i <- + new$log2_fc + + # for contrast j + # for all kinases i + # compute mean of abs(log2 of fold-change) + # (used in KSEA.Scores.R lines # 130) + mean_abs_log2_fc_j_all_i <- + mean(abs(log2_fc_j_each_i), na.rm = TRUE) + + # for contrast j + # for all kinases i + # compute mean of log2 of fold-change + # (used in KSEA.Scores.R lines # 132) + mean_log2_fc_j_all_i <- + mean(log2_fc_j_each_i, na.rm = TRUE) + + # Reorder mean_fc, although I don't see why + # (KSEA.Scores.R line 128 + mean_fc <- mean_fc[order(mean_fc[, 1]), ] + # mean_fc columns so far: "Kinase.Gene", "log2FC" + # - Kinase.Gene + # indicates the gene name for each kinase. # Create column 3: mS - mean_fc$m_s <- mean_fc[, 2] + # - mS + # represents the mean log2(fold change) of all the + # kinase's substrates. + # (KSEA.Scores.R line # 129) + mean_fc$m_s <- + mean_fc_m_s <- mean_fc[, 2] + # Create column 4: Enrichment - mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) - # Create column 5: m, count of substrates - mean_fc$m <- kinase_list + # - Enrichment + # is the background-adjusted value of the kinase's mS. + # (KSEA.Scores.R line # 130) + mean_fc$enrichment <- + mean_fc_m_s / mean_abs_log2_fc_j_all_i + + # Create column 5: m, count of substrates of kinase (count of j for i) + # - m + # is the total number of detected substrates + # from the experimental dataset for each kinase. + # (KSEA.Scores.R line # 131) + mean_fc$m <- + mean_fc_m <- kinase_list + + # Create column 6: z-score - mean_fc$z_score <- ( - (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * - sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) + # - z.score + # is the normalized score for each kinase, weighted by + # the number of identified substrates. + # (KSEA.Scores.R line # 132) + mean_fc$z_score <- + (mean_fc_m_s - mean_log2_fc_j_all_i) * sqrt(mean_fc_m) / + sd(log2_fc_j_each_i, na.rm = TRUE) + # Create column 7: p-value, deduced from z-score - mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) + # - p.value + # represents the statistical assessment for the z.score. + # (KSEA.Scores.R line # 133) + # "one-tailed p-value" + mean_fc$p_value <- + pnorm(-abs(mean_fc$z_score)) + + # zap excluded kinases; this must be done before adjusting p-value + if (TRUE) { + mean_fc <- + mean_fc[ + mean_fc$m >= minimum_substrate_count, + , + drop = FALSE + ] + } + + #ACE nb(see_variable(nrow(mean_fc)), "\n") # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value - mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") - - # Remove log2FC column, which is duplicated as mS - mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] + # - FDR + # is the p-value adjusted for multiple hypothesis testing + # using the Benjamini & Hochberg method." + # (KSEA.Scores.R line # 134) + mean_fc$fdr <- + p.adjust(mean_fc$p_value, method = "fdr") + + # It makes no sense to leave Z-scores negative when using + # absolute value of fold-change + if (params$kseaUseAbsoluteLog2FC) { + mean_fc$z_score <- abs(mean_fc$z_score) + } + + # Remove second column (log2FC), which is duplicated as mS + # (KSEA.Scores.R line # 136) + mean_fc <- + mean_fc[order(mean_fc$Kinase.Gene), -2] # Correct the column names which we had to hack because of the linter... colnames(mean_fc) <- c( "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" ) + # (KSEA.Scores.R line # 138) return(mean_fc) } -low_fdr_barplot <- function( +ksea_low_fdr_barplot_factory <- function( rslt, i_cntrst, i, @@ -658,57 +1784,94 @@ "p.value" = { k$p_value }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) ) - ) - ) + param_df_exit() + knitr::knit_exit() + } + ) k <- k[selector < ksea_cutoff_threshold, ] - - if (nrow(k) > 1) { - op <- par(mai = c(1, 1.5, 0.4, 0.4)) + nrow_k <- nrow(k) + + #ACE nbe(see_variable(fdr_barplot_dataframe <- k)) + + if (nrow_k > 0) { + max_nchar_rowname <- max(nchar(rownames(k))) + my_cex_names <- 1.0 / (1 + nrow_k / 50) + + if (print_trace_messages) cat_margins("Initially") + if (print_trace_messages) cat_variable(nrow_k, "\n\n", 0) + if (print_trace_messages) cat_variable(my_cex_names, "\n\n", 0) + if (print_trace_messages) cat_variable(max_nchar_rowname, "\n\n", 0) + + # fin: The figure region dimensions, (width, height), in inches. + # mar: A numerical vector of the form c(bottom, left, top, right) + # that gives the number of lines of margin to be specified + # on the four sides of the plot; default: c(5, 4, 4, 2) + 0.1 + +# mar: The figure region dimensions, (width, height), in inches. numeric_z_score <- as.numeric(k$z_score) - z_score_order <- order(numeric_z_score) + bar_order <- order(-as.numeric(k$p_value)) kinase_name <- k$kinase_gene long_caption <- sprintf( - "Kinase z-score, %s < %s, %s", + "Kinase z-score, %s, KSEA %s < %s", + caption, ksea_cutoff_statistic, - ksea_cutoff_threshold, - caption + ksea_cutoff_threshold ) my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) - cat("\n\\clearpage\n") - barplot( - height = numeric_z_score[z_score_order], - border = NA, - xpd = FALSE, - cex.names = 1.0, - cex.axis = 1.0, - main = long_caption, - cex.main = my_cex_caption, - names.arg = kinase_name[z_score_order], - horiz = TRUE, - srt = 45, - las = 1) - par(op) + # return a function that draws the plot + function() { + par_fin <- par("fin") # vector of width_in_inches and height_in_inches) + op <- par( + bg = if (print_trace_messages) "yellow" else "white", + fin = c(par_fin[1], min(par_fin[2], 2.5 + nrow_k / 6)), + mar = par("mar") + + c(3 / nrow_k, (1 + max_nchar_rowname * my_cex_names) / 2, 0, 0) + # bottom, left, top, right + ) + on.exit(par(op)) + if (print_trace_messages) cat_margins("Eventually") + + barplot( + height = numeric_z_score[bar_order], + border = NA, + xpd = FALSE, + cex.names = my_cex_names, + main = long_caption, + cex.main = my_cex_caption, + names.arg = kinase_name[bar_order], + horiz = TRUE, + srt = 45, + las = 1, + cex.axis = 0.9 + ) + } } + } else { + no_op } } # note that this adds elements to the global variable `ksea_asterisk_hash` -low_fdr_print <- function( +ksea_low_fdr_print <- function( rslt, i_cntrst, i, a_level, b_level, fold_change, - caption + caption, + write_db = TRUE, # if TRUE, write to DB, else print table + anchor = c(const_table_anchor_p, const_table_anchor_t) ) { rslt_score_list_i <- rslt$score_list[[i]] if (!is.null(rslt_score_list_i)) { @@ -734,13 +1897,17 @@ "p.value" = { k$p_value }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) ) - ) - ) + param_df_exit() + knitr::knit_exit() + } + ) k <- k[selector < ksea_cutoff_threshold, ] # save kinase names to ksea_asterisk_hash @@ -748,110 +1915,175 @@ ksea_asterisk_hash[[kinase_name]] <- 1 } - db_write_table_overwrite <- (i_cntrst < 2) - db_write_table_append <- !db_write_table_overwrite - RSQLite::dbWriteTable( - conn = db, - name = "contrast_ksea_scores", - value = contrast_ksea_scores, - append = db_write_table_append + if (write_db) { + db_write_table_overwrite <- (i_cntrst < 2) + db_write_table_append <- !db_write_table_overwrite + RSQLite::dbWriteTable( + conn = db, + name = "contrast_ksea_scores", + value = contrast_ksea_scores, + append = db_write_table_append + ) + "" + } else { + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + contrast_ksea_scores$fdr + }, + "p.value" = { + contrast_ksea_scores$p_value + }, + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + param_df_exit() + knitr::knit_exit() + } ) - selector <- switch( - ksea_cutoff_statistic, - "FDR" = { - contrast_ksea_scores$fdr - }, - "p.value" = { - contrast_ksea_scores$p_value - }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic + if (print_nb_messages) nbe(see_variable(contrast_ksea_scores)) + output_df <- contrast_ksea_scores[ + selector < ksea_cutoff_threshold, + c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", + "z_score", "p_value", "fdr") + ] + output_df$kinase_gene <- + gsub( + "_", + "\\\\_", + output_df$kinase_gene + ) + colnames(output_df) <- + c( + colnames(output_df)[1], + colnames(output_df)[2], + "enrichment", + "m_s", + "z_score", + "p_value", + "fdr" + ) + #ACE output_order <- with(output_df, order(fdr)) + output_order <- with(output_df, order(p_value)) + output_df <- output_df[output_order, ] + + output_df[, 2] <- sprintf("%0.3g", output_df[, 2]) + output_df$fdr <- sprintf("%0.4f", output_df$fdr) + output_df$p_value <- sprintf("%0.2e", output_df$p_value) + output_df$z_score <- sprintf("%0.2f", output_df$z_score) + output_df$m_s <- sprintf("%d", output_df$m_s) + output_df$enrichment <- sprintf("%0.3g", output_df$enrichment) + output_ncol <- ncol(output_df) + colnames(output_df) <- + c( + "Kinase", + "\\(\\overline{{\\lvert}\\log_2 (\\text{fold-change}){\\rvert}}\\)", + "Enrichment", + "Substrates", + "z-score", + "p-value", + "FDR" + ) + selector <- switch( + ksea_cutoff_statistic, + "FDR" = { + rslt$score_list[[i]]$FDR + }, + "p.value" = { + rslt$score_list[[i]]$p.value + }, + { + cat( + sprintf( + "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", + ksea_cutoff_statistic + ) + ) + param_df_exit() + knitr::knit_exit() + } + ) + if (sum(selector < ksea_cutoff_threshold) > 0) { + if (print_nb_messages) nbe(see_variable(output_df)) + math_caption <- gsub("{", "\\{", caption, fixed = TRUE) + math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) + # with ( + # output_df, + # ) + if (TRUE) { + output_df$Kinase <- whack_underscores(output_df$Kinase) + data_frame_tabbing_latex( + x = output_df, + # vector of tab stops, in inches + tabstops = c(1.0, 1.2, 1.0, 1.0, 1.0, 1.0), + # vector of headings, registered with tab-stops + headings = colnames(output_df), + # digits to pass to format.data.frame + digits = NULL, + # maximumn number of rows to print + max = NULL, + # optional caption + caption = sprintf( + "\\text{%s}, KSEA %s < %s", + math_caption, + ksea_cutoff_statistic, + ksea_cutoff_threshold + ), + # set underscore_whack to TRUE to escape underscores + underscore_whack = FALSE, + # flag for landscape mode + landscape = FALSE, + # flag indicating that subsubsection should be used for caption + # rather than subsection + use_subsubsection_header = TRUE, + # character-size indicator; for possible values, see: + # https://tug.org/texinfohtml/latex2e.html#Font-sizes + charactersize = "small", + # set verbatim to TRUE to debug output + verbatim = FALSE + ) + } else { + data_frame_table_latex( + x = output_df, + justification = "l c c c c c c", + centered = TRUE, + caption = sprintf( + "\\text{%s}, KSEA %s < %s", + math_caption, + ksea_cutoff_statistic, + ksea_cutoff_threshold + ), + anchor = anchor, + underscore_whack = FALSE + ) + } + } else { + cat( + sprintf( + "\\break + No kinases had + \\(\\text{KSEA %s}_\\text{enrichment} < %s\\) + for contrast %s\\hfill\\break\n", + ksea_cutoff_statistic, + ksea_cutoff_threshold, + caption ) ) - ) - output_df <- contrast_ksea_scores[ - selector < ksea_cutoff_threshold, - c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", - "z_score", "p_value", "fdr") - ] - output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) - output_df <- output_df[output_order, ] - colnames(output_df) <- - c( - colnames(output_df)[1], - colnames(output_df)[2], - "enrichment", - "m_s", - "z_score", - "p_value", - "fdr" - ) - output_df$fdr <- sprintf("%0.4f", output_df$fdr) - output_df$p_value <- sprintf("%0.2e", output_df$p_value) - output_df$z_score <- sprintf("%0.2f", output_df$z_score) - output_df$m_s <- sprintf("%d", output_df$m_s) - output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) - output_ncol <- ncol(output_df) - colnames(output_df) <- - c( - "Kinase", - "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", - "Enrichment", - "Substrates", - "z-score", - "p-value", - "FDR" - ) - selector <- switch( - ksea_cutoff_statistic, - "FDR" = { - rslt$score_list[[i]]$FDR - }, - "p.value" = { - rslt$score_list[[i]]$p.value - }, - stop( - sprintf( - "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", - ksea_cutoff_statistic - ) - ) - ) - if (sum(selector < ksea_cutoff_threshold) > 0) { - math_caption <- gsub("{", "\\{", caption, fixed = TRUE) - math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) - data_frame_latex( - x = output_df, - justification = "l c c c c c c", - centered = TRUE, - caption = sprintf( - "\\text{%s}, %s < %s", - math_caption, - ksea_cutoff_statistic, - ksea_cutoff_threshold - ), - anchor = const_table_anchor_p - ) - } else { - cat( - sprintf( - "\\break - No kinases had - \\(\\text{%s}_\\text{enrichment} < %s\\) - for contrast %s\\hfill\\break\n", - ksea_cutoff_statistic, - ksea_cutoff_threshold, - caption - ) - ) + } } + } else { + "" } } # create_breaks is a helper for ksea_heatmap create_breaks <- function(merged_scores) { + if (sum(!is.na(merged_scores)) < 2) + return(NULL) if (min(merged_scores, na.rm = TRUE) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- @@ -889,14 +2121,135 @@ return(color_breaks) } +hm2plus <- function( + x, + mat = matrix( + c( + c(0, 4, 0), + c(0, 3, 3), + c(2, 1, 1) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + denwid = 0.5, + denhgt = 0.15, + widths = c(0.5, 2.5, 1.5), + heights = c(0.4, 0.15, 3.95), + divergent = FALSE, + notecol = "grey50", + trace = "none", + margins = c(6, 20), + srtcol = 90, + srtrow = 0, + density_info = "none", + key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + key_par = list(), + hclustfun = hclust, + ... +) { + + varargs <- list(...) + if (FALSE) # this is to avoid commenting out code to pass linting... + my_hm2 <- latex_show_invocation(heatmap.2, head_patch = FALSE) + else + my_hm2 <- heatmap.2 + + x <- as.matrix(x) + if (sum(!is.na(x)) < 1) + return(NULL) + color_count <- 1 + max(64, length(as.vector(x))) # 8 was not enough + break_count <- 1 + color_count + min_nonax <- min(x, na.rm = TRUE) + max_nonax <- max(x, na.rm = TRUE) + if (print_nb_messages) nb("within hm2plus", see_variable(divergent), "\n") + if (divergent) { + zlim <- max(abs(min_nonax), abs(max_nonax)) + if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) + breaks <- (zlim) / (break_count:1) + if (print_nb_messages) nb(see_variable(breaks, "\n")) + breaks <- breaks - median(breaks) + zlim <- c(-zlim, zlim) + if (print_nb_messages) nb(see_variable(zlim, "\n")) + } else { + zlim <- max(abs(min_nonax), abs(max_nonax)) + if (print_nb_messages) nb(see_variable(pre_zlim <- zlim, "\n")) + breaks <- zlim / (break_count:1) + if (print_nb_messages) nb(see_variable(breaks, "\n")) + if (max_nonax < 0) { + breaks <- breaks - zlim + zlim <- c(-zlim, 0) + } else { + zlim <- c(0, zlim) + } + if (print_nb_messages) nb(see_variable(zlim, "\n")) + } + nonax <- x + nonax[is.na(x)] <- min_nonax + if (is.null(widths)) widths <- c(denwid, 4 - denwid, 1.5) + if (is.null(heights)) heights <- c(0.4, denhgt, 4.0) + colors <- + if (divergent && min_nonax < 0) { + # divergent colors on both sides of zero + colorRampPalette(c("red", "white", "blue"))(color_count) + } else if (divergent && min_nonax > 0) { + # "divergent" colors > zero + colorRampPalette(c("white", "blue"))(color_count) + } else if (divergent && max_nonax < 0) { + # "divergent" colors < zero + colorRampPalette(c("red", "white"))(color_count) + } else { + # "non-divergent" colors including zero + hcl.colors(color_count, "YlOrRd", rev = TRUE) + } + + #ACE if (print_nb_messages) nb("within hm2plus", see_variable(key_par), "\n") + #ACE if (print_nb_messages) nb(see_variable(colors, "\n")) + #ACE key_par$col = colors + #ACE key_par$breaks = breaks + + if (print_nb_messages) nb(see_variable(par(), "\n")) #ACE TODO remove me + if (print_nb_messages) cat("\\leavevmode\n\\linebreak\n") #ACE TODO remove me + suppressWarnings( + my_hm2( + x = x, + col = colors, + #ACE symkey = FALSE, + density.info = density_info, + srtCol = srtcol, + srtRow = srtrow, + margins = margins, + lwid = widths, + lhei = heights, + key.title = NA, + key.xlab = key_xlab, + key.par = key_par, + lmat = mat, + notecol = notecol, + trace = trace, + bg = "yellow", + hclustfun = hclustfun, + #ACE breaks = breaks, + oldstyle = FALSE, + ... # varargs + ) + ) + # implicitly returning value returned by heatmap.2 +} + # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap draw_kseaapp_summary_heatmap <- function( - x, - sample_cluster, - merged_asterisk, - my_cex_row, - color_breaks, - margins, + x, # matrix with row/col names already formatted + sample_cluster, # a binary input of TRUE or FALSE, + # indicating whether or not to perform + # hierarchical clustering of the sample columns + merged_asterisk, # matrix having dimensions of x, values "*" or "" + color_breaks, # breaks for color gradation, from create_breaks + # passed to `breaks` argument of `image` + margins = c(8, 15), # two integers setting the bottom and right margins + # to accommodate row and column labels + master_cex = 0.7, # basis for text sizes ... ) { merged_scores <- x @@ -908,44 +2261,128 @@ "' rather than 'matrix'.\n\n" ) ) - } else if (nrow(x) < 2) { - cat("No plot because matrix x has ", nrow(x), " rows.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") - } else if (ncol(x) < 2) { - cat("No plot because matrix x has ", ncol(x), " columns.\n\n") - cat("\\begin{verbatim}\n") - str(x) - cat("\\end{verbatim}\n") - } else { - gplots::heatmap.2( - x = merged_scores, - Colv = sample_cluster, - scale = "none", - cellnote = merged_asterisk, - notecol = "white", - cexCol = 0.9, - # Heuristically assign size of row labels - cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), - srtCol = 45, - srtRow = 45, - notecex = 3 * my_cex_row, - col = color_breaks[[2]], - density.info = "none", - trace = "none", - breaks = color_breaks[[1]], - lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), - lhei = c(0.4, 8.0, 1.1), - lwid = c(0.5, 3), - key = FALSE, - margins = margins, - ... - ) + cat_variable(x) + return(FALSE) + } + if (print_trace_messages) cat(sprintf("master_cex = %03f\n\n", master_cex)) + nrow_x <- nrow(x) + ncol_x <- ncol(x) + #if (nrow_x < 2) { + if (nrow_x < 1) { + cat("No plot because matrix has no rows.\n\n") + return(FALSE) + } else if (nrow_x < 2) { + cat("No plot because matrix has one row. Matrix looks like this:\n\n") + cat("\n\\begin{verbatim}\n") + print(x) + cat("\n\\end{verbatim}\n") + return(FALSE) + } else if (ncol_x < 2) { + cat("No plot because matrix x has ", ncol_x, " columns.\n\n") + cat_variable(x) + return(FALSE) } + max_nchar_rowname <- max(nchar(rownames(x))) + max_nchar_colname <- max(nchar(colnames(x))) + my_limit <- g_intensity_hm_rows + + my_row_cex_scale <- master_cex * 150 / nrow_x + my_col_cex_scale <- 3.0 + my_asterisk_scale <- 0.4 * my_row_cex_scale + my_row_warp <- 1 + my_note_warp <- 2 + my_row_warp <- 1 + my_row_cex_asterisk <- + master_cex * my_row_warp * my_asterisk_scale + + + my_col_cex <- my_col_cex_scale * master_cex + my_row_cex <- min(3.5 * my_row_cex_asterisk, my_col_cex) + my_key_cex <- 1.286 + my_hm2_cex <- 1 * master_cex + my_offset <- (4.8 / (9 + nrow_x / 10)) - 0.4 + if (print_trace_messages) cat(sprintf("nrow_x = %03f\n\n", nrow_x)) + if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) + my_offset <- 0.05 + if (print_trace_messages) cat(sprintf("my_offset = %03f\n\n", my_offset)) + my_scale <- 3.0 + if (ncol_x < 10 && nrow_x < 10) + my_scale <- my_scale * 10 / (10 - nrow_x) * 10 / (10 - ncol_x) + + my_heights <- c( + 0.15, + 3.85 - my_offset, + 0.5 + my_offset + ) + my_margins <- c(1, 1) + + c( + margins[1] * 0.08 * max_nchar_colname * my_col_cex, + margins[2] * 0.04 * max_nchar_rowname * my_row_cex + ) + + my_notecex <- + my_scale * + min( + 1.1, + my_row_cex_asterisk * my_note_warp, + my_col_cex * my_note_warp + ) + + if (print_trace_messages) { + cat_variable(my_heights, suffix = "; ") + cat_variable(my_margins, suffix = "\n\n") + cat_variable(my_row_cex_scale, suffix = "; ") + cat_variable(my_col_cex_scale, suffix = "\n\n") + cat_variable(my_row_cex_asterisk, suffix = "\n\n") + cat_variable(my_row_cex, suffix = "; ") + cat_variable(my_col_cex, suffix = "\n\n") + cat_variable(my_row_cex, suffix = "; ") + cat_variable(my_col_cex, suffix = "\n\n") + } + + hm2plus( + x = merged_scores, + Colv = sample_cluster, + cellnote = merged_asterisk, + cex = my_hm2_cex, + cexCol = my_col_cex, + cexRow = my_row_cex, + denhgt = 0.15, + density_info = "none", + denwid = 0.5, + divergent = TRUE, + key_par = list(cex = my_key_cex), + key_xlab = "Z-score", + margins = my_margins, + notecex = my_scale * min( + 1.5, + my_row_cex_asterisk * my_note_warp, + my_col_cex * my_note_warp + ), + notecol = "white", + scale = "none", + srtcol = 90, + srtrow = 0, + trace = "none", + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + widths = c(0.5, 3.1, 0.9), + heights = my_heights, + ... + ) + return(TRUE) } -# Adapted from KSEAapp::KSEA.Heatmap +# function drawing heatmap of contrast fold-change for each kinase, +# adapted from KSEAapp::KSEA.Heatmap ksea_heatmap <- function( # the data frame outputs from the KSEA.Scores() function, in list format score_list, @@ -961,16 +2398,19 @@ m_cutoff, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap - p_cutoff = - stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), + p_cutoff = { + cat("argument 'p_cutoff' is required for function 'ksea_heatmap'") + param_df_exit() + knitr::knit_exit() + }, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster, # a binary input of TRUE or FALSE, indicating whether or not to export # the heatmap as a .png image into the working directory export = FALSE, - # bottom and right margins; adjust as needed if contrast names are too long - margins = c(6, 20), + # bottom and right margins; adjust as needehttps://tex.stackexchange.com/a/56795d if contrast names are too long + margins = c(6, 6), # print which kinases? # - Mandatory argument, must be one of const_ksea_.*_kinases which_kinases, @@ -993,7 +2433,7 @@ master <- Reduce( f = function(...) { - base::merge(..., by = "Kinase.Gene", all = FALSE) + base::merge(..., by = "Kinase.Gene", all = TRUE) }, x = score_list_m ) @@ -1019,15 +2459,13 @@ } merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) - # begin hack to print only significant rows asterisk_rows <- rowSums(merged_asterisk == "*") > 0 all_rows <- rownames(merged_stats) names(asterisk_rows) <- all_rows non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) - merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] - merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] - # end hack to print only significant rows + merged_scores_asterisk <- merged_scores[names(asterisk_rows), , drop = FALSE] + merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), , drop = FALSE] row_list <- list() row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows @@ -1036,14 +2474,17 @@ i <- which_kinases my_row_names <- row_list[[i]] - scrs <- merged_scores[my_row_names, ] - stts <- merged_stats[my_row_names, ] + scrs <- merged_scores[my_row_names, , drop = FALSE] + stts <- merged_stats[my_row_names, , drop = FALSE] merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) color_breaks <- create_breaks(scrs) + if (is.null(color_breaks)) { + cat("No plot because matrix has too few rows.\n\n") + return(NULL) + } plot_height <- nrow(scrs) ^ 0.55 plot_width <- ncol(scrs) ^ 0.7 - my_cex_row <- 0.25 * 16 / plot_height if (export == "TRUE") { png( "KSEA.Merged.Heatmap.png", @@ -1053,96 +2494,600 @@ pointsize = 14 ) } - draw_kseaapp_summary_heatmap( + did_draw <- draw_kseaapp_summary_heatmap( x = scrs, sample_cluster = sample_cluster, merged_asterisk = merged_asterisk, - my_cex_row = my_cex_row, color_breaks = color_breaks, margins = margins ) if (export == "TRUE") { dev.off() } + if (!did_draw) + return(NULL) return(my_row_names) } -# helper for heatmaps of phosphopeptide intensities - -draw_intensity_heatmap <- +# helpers for heatmaps of phosphopeptide intensities + +# factory producing function to truncate string after n characters +trunc_n <- function(n) { + function(x) { + sapply( + X = x, + FUN = function(s) { + if (is.na(s)) + return("NA") + cond <- try_catch_w_e(nchar(s) > n) + if (!is.logical(cond$value)) { + return(cond$value$message) + } else if (cond$value) { + paste0( + strtrim(s, n), + "..." + ) + } else { + s + } + }, + USE.NAMES = FALSE + ) + } + } +trunc_long_ppep <- function(x) trunc_n(40)(x) +trunc_ppep <- function(x) trunc_n(g_ppep_trunc_n)(x) +trunc_subgene <- function(x) trunc_n(g_subgene_trunc_n)(x) +trunc_enriched_substrate <- function(x) trunc_n(g_sbstr_trunc_n)(x) + +# factory producing a function that returns a covariance +# matrix's rows (and columns) having variance > v_min +keep_cov_w_var_gtr_min <- function(v_min) { + function(x) { + if (!is.matrix(x)) + return(NULL) + keepers <- sapply( + X = seq_len(nrow(x)), + FUN = function(i) { + if (x[i, i] < v_min) + NA + else + x[i, i] + } + ) + names(keepers) <- rownames(x) + keepers <- keepers[!is.na(keepers)] + keepers <- names(keepers) + if (length(keepers) == 0) + return(NULL) + x[keepers, keepers] + } +} +# function that returns a matrix's rows having variance > 1 +keep_cov_w_var_gtr_1 <- keep_cov_w_var_gtr_min(1) + +# factory producing a function that returns +# - either a matrix's rows (rows = TRUE) +# - or a matrix's columns (rows = FALSE) +# having variance > v_min +keep_var_gtr_min <- function(v_min) { + function(x, rows = TRUE) { + nrowcol <- if (rows) nrow else ncol + if (!is.matrix(x)) + return(NULL) + keepers <- sapply( + X = seq_len(nrowcol(x)), + FUN = function(i) { + row_var <- var( + if (rows) x[i, ] else x[, i], + na.rm = TRUE + ) + if (is.na(row_var) || row_var <= v_min) NA else i + } + ) + keepers <- keepers[!is.na(keepers)] + if (rows) x[keepers, ] else x[, keepers] + } +} + +keep_var_gtr_0 <- keep_var_gtr_min(0) + +# function drawing heatmap of phosphopeptide intensities +ppep_heatmap <- function( m, # matrix with rownames already formatted cutoff, # cutoff used by hm_heading_function - hm_heading_function, # construct and cat heading from m and cutoff + hm_heading_function, # construct $ cat heading from m and cutoff hm_main_title, # main title for plot (drawn below heading) suppress_row_dendrogram = TRUE, # set to false to show dendrogram - max_peptide_count # experimental: - = intensity_hm_rows, # values of 50 and 75 worked well - ... # passthru parameters for heatmap + max_peptide_count = # experimental: + g_intensity_hm_rows, # values of 50 and 75 worked well + master_cex = 1.0, # basis for text sizes + margins = NULL, # optional margins (bottom, right) + cellnote = NULL, # optional matrix of character; dim = dim(m) + adj = 0.5, # adjust text: 0 left, 0.5 middle, 1 right + ... # passthru to hm2plus or heatmap.2 ) { + use_heatmap_1 <- FALSE peptide_count <- 0 # emit the heading for the heatmap if (hm_heading_function(m, cutoff)) { - peptide_count <- min(max_peptide_count, nrow(m)) - if (nrow(m) > 1) { + nrow_m <- nrow(m) + peptide_count <- min(max_peptide_count, nrow_m) + if (nrow_m > 1) { m_margin <- m[peptide_count:1, ] - # Margin setting was heuristically derived - margins <- - c(0.5, # col - max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row - ) - } - if (nrow(m) > 1) { + # Margin was heuristically derived to accommodate the widest label + row_mchar_max <- max(nchar(rownames(m_margin))) + col_mchar_max <- max(nchar(colnames(m_margin))) + row_margin <- master_cex * row_mchar_max * 2.6 + col_margin <- master_cex * col_mchar_max * 2.6 + if (print_trace_messages) cat(sprintf("row_margin = %0.3f; ", row_margin)) + if (print_trace_messages) cat(sprintf("col_margin = %0.3f; ", col_margin)) + hm_call <- NULL tryCatch( { - old_oma <- par("oma") - par(cex.main = 0.6) - # Heuristically determined character size adjustment formula - char_contractor <- - 250000 / ( - max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows + # set non-argument parameters for hm_call inner function + my_row_cex <- + master_cex * 200000 / ( + (max(nchar(rownames(m_margin)))^2) * g_intensity_hm_rows ) - heatmap( - m[peptide_count:1, ], - Rowv = if (suppress_row_dendrogram) NA else NULL, - Colv = NA, - cexRow = char_contractor, - cexCol = char_contractor * 50 / max_peptide_count, - scale = "row", - margins = margins, - main = - "Unimputed, unnormalized log(intensities)", - xlab = "", - las = 1, - ... + m_hm <- m[peptide_count:1, , drop = FALSE] + if (is.null(cellnote)) { + cellnote <- matrix("", nrow = nrow(m_hm), ncol = ncol(m_hm)) + cellnote[is.na(m_hm)] <- "NA" + } else { + cellnote <- cellnote[peptide_count:1, , drop = FALSE] + } + m_hm[is.na(m_hm)] <- 0 + nrow_m_hm <- nrow(m_hm) + ncol_m_hm <- ncol(m_hm) + if (nrow_m_hm < 1 || ncol_m_hm < 1) + return(peptide_count) # return zero as initialized above + my_limit <- g_intensity_hm_rows + + + my_row_cex <- master_cex * (100 / (2 + row_mchar_max)) + my_col_cex <- master_cex * 6 * row_margin / col_margin + my_col_adj <- min(my_col_cex, my_row_cex) / my_col_cex + my_col_cex <- min(my_col_cex, my_row_cex) + col_margin <- sqrt(my_col_adj) * col_margin + if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) + if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) + if (is.null(margins)) my_margins <- + c( + (my_col_cex + col_margin), # col + (my_row_cex + row_margin) / my_row_cex # row + ) + else + my_margins <- margins + + if (print_trace_messages) cat( + sprintf( + "my_margins = c(%s)\n\n", + paste(my_margins, collapse = ", ") + ) + ) + my_hm2_cex <- 2 * master_cex + my_key_cex <- 0.9 - 0.1 * (g_intensity_hm_rows + nrow_m_hm) / g_intensity_hm_rows + my_key_warp <- 1.5 * 22.75 / row_margin + my_key_cex <- min(1.10, my_key_warp * my_key_cex) + my_hgt_scale <- 3.70 - 0.4 * (max(1, 0.9 * my_row_cex) - 1) + my_hgt_scale <- 3.75 # 3.615 + my_hgt_scale <- 3.60 # 3.615 + if (print_trace_messages) + cat_variable(my_hgt_scale, "\n\n", 3) + my_warp <- max(0.1, 1.4 * (7.5 + nrow_m) / g_intensity_hm_rows) + if (print_trace_messages) + cat_variable(my_warp, "\n\n", 3) + # added 0.9 heuristically... + my_plot_height <- + (0.566 + 0.354 * (nrow_m / g_intensity_hm_rows)) * + min(my_hgt_scale, my_hgt_scale * my_warp) + my_plot_height <- min(3.65, my_plot_height * g_intensity_hm_rows / 50) + my_heights <- c( + 0.3, # title and top dendrogram + my_plot_height, # plot and bottom margin + 4.15 - my_hgt_scale, # legend + 0.05 + my_hgt_scale - my_plot_height # whitespace below legend ) + my_note_cex <- min(0.8, my_row_cex, my_col_cex) + if (print_trace_messages) { + cat_variable(my_plot_height, "\n\n", 3) + cat_variable(4.19 - my_hgt_scale, "\n\n", 3) + cat_variable(nrow_m_hm, "; ", 0) + cat_variable(ncol_m_hm, "; ", 0) + cat_variable(my_row_cex, "; ", 3) + cat_variable(my_col_cex, "; ", 3) + cat_variable(my_note_cex, "; ", 3) + cat_variable(my_key_cex, "\n\n", 3) + cat_variable(my_hgt_scale, "; ", 3) + cat_variable(my_plot_height, "; ", 3) + cat_variable(my_warp, "\n\n", 3) + cat_variable(my_heights, "; ", 2) + cat_variable(sum(my_heights), "\n\n", 3) + } + + # define hm_call inner function + hm_call <- function(x, scaling, title) { + my_cex_main <- min(5.0, 220 / nchar(title)) + op <- par( + cex.main = my_cex_main * master_cex, + adj = adj + ) + if ( + !is.null( + hm2plus( + x, + Colv = NA, + Rowv = TRUE, + cexRow = my_row_cex, + cexCol = my_col_cex, + dendrogram = "row", + las = 1, + main = title, + key_xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + cex = my_hm2_cex, + key_par = list(cex = my_key_cex), + margins = my_margins, + widths = c(0.4, 2.6, 1.5), + heights = my_heights, + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0), + c(0, 0, 0) + ), + nrow = 4, + ncol = 3, + byrow = TRUE + ), + na.rm = TRUE, + scale = scaling, + srtcol = 90, + srtrow = 0, + xlab = "", + cellnote = cellnote, + notecex = my_note_cex, + ... + ) + ) + ) { + if (print_trace_messages) cat( + sprintf( + "my_heights = c(%s); sum = %0.3f\n\n", + paste( + sprintf("%0.3f", my_heights), + collapse = ", " + ), + sum(my_heights) + ) + ) + if (print_trace_messages) cat( + sprintf("my_key_cex = %0.3f\n\n", + my_key_cex) + ) + if (print_trace_messages) cat( + sprintf("my_key_cex/my_heights[3] = %0.3f\n\n", + my_key_cex / my_heights[3]) + ) + if (print_trace_messages) cat( + sprintf("my_heights[2]/my_heights[3] = %0.3f\n\n", + my_heights[2] / my_heights[3]) + ) + } + par(op) + } + + # invoke hm_call inner function + if (sum(rowSums(!is.na(m_hm)) < 2)) + hm_call( + m_hm, + "none", + "log(intensities), unscaled, unimputed, and unnormalized" + ) + else + hm_call( + m_hm, + "row", + "log(intensities), row-scaled, unimputed, and unnormalized" + ) }, error = function(e) { - cat( - sprintf( - "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", - e$message + if (!is.null(hm_call)) { + m_hm[is.na(m_hm)] <- 0 + tryCatch( + { + if (nrow(m_hm) > 1) + hm_call( + m_hm, + "none", + paste( + "log(intensities), unscaled,", + "zero-imputed, unnormalized" + ) + ) + else + cat("\nThere are too few peptides to produce a heatmap.\n") + }, + error = function(r) { + cat( + sprintf( + "\n%s %s Internal message: %s\n\\newline\n\n", + "Failure drawing heatmap,", + "possibly because of too many missing values.\n\\newline\n\n", + r$message + ) + ) + cat_margins() + } + ) + } else { + cat( + "\nFailure drawing heatmap, possibly because of too many missing values.\n" ) - ) - }, - finally = par(old_oma) + } + } ) } } - return(peptide_count) + # return value: + peptide_count } + +# function drawing heatmap of correlations if they exist, else covariances +cov_heatmap <- + function( + m, # matrix with rownames already formatted + top_substrates = FALSE, + ... # passthru to hm2plus or heatmap.2 + ) { + if (print_nb_messages) nbe(see_variable(m), " [", nrow(m), "x", ncol(m), "\n") + #ACE nb(rowSums(m, na.rm = TRUE)) + #ACE bad_rows <- (rowSums(m, na.rm = TRUE) == 0) + #ACE nb(see_variable(bad_rows)) + #ACE m <- m[-bad_rows, , drop = FALSE] + colnames_m <- colnames(m) + is_na_m <- is.na(m) + tmp <- m + tmp[is_na_m] <- 0 + + tmp <- m[, 0 < colSums(x = tmp)] # by default, na.rm is FALSE + + colnames_tmp <- colnames(tmp) + + my_low_p_seq <- seq( + from = min(g_intensity_hm_rows, nrow(m)), + to = 1, + by = -1 + ) + + if (g_correlate_substrates) { + # zap samples having zero or near-zero variance + tmp[is.na(tmp)] <- 0 + nzv <- caret::nearZeroVar( + tmp, # matrix of values, samples x variables + freqCut = 1.01, # min(freq most prevalent value / + # freq second most prevalent) + uniqueCut = 99 # max(number of unique values / + # total number of samples) + ) + tmp <- if (length(nzv) > 0) { + m[, -nzv, drop = FALSE] + } else { + m + } + } else { + tmp <- m[my_low_p_seq, , drop = FALSE] + } + + + t_m <- t(tmp) + t_m[is.na(t_m)] <- 0 + prefiltered_nrow <- ncol(t_m) + + my_corcov <- cov(t_m) + did_filter_rows <- did_filter_cols <- FALSE + if (g_correlate_substrates && !is_positive_definite(my_corcov)) { + my_correlate_substrates <- FALSE + t_m <- t(m[my_low_p_seq, , drop = FALSE]) + t_m[is.na(t_m)] <- 0 + unfiltered_row_count <- ncol(t_m) + unfiltered_col_count <- nrow(t_m) + + # zap empty samples + t_m <- t_m[0 < rowSums(x = t_m), ] + # zap substrates present in fewer than two samples + foo <- t_m > 0 + foo <- colSums(x = foo) > 1 + t_m <- t_m[, foo] + + did_filter_rows <- unfiltered_row_count > ncol(t_m) + did_filter_cols <- unfiltered_col_count > nrow(t_m) + + colnames_tmp <- rownames(t_m) + my_corcov <- cov(t_m) + if (g_filter_cov_var_gt_1) { + my_corcov <- keep_cov_w_var_gtr_1(my_corcov) + } + } else if (g_correlate_substrates) { + my_corcov <- cov2cor(my_corcov) + my_correlate_substrates <- TRUE + } else { + my_correlate_substrates <- FALSE + if (g_filter_cov_var_gt_1) my_corcov <- keep_cov_w_var_gtr_1(my_corcov) + } + + omitted_samples <- colnames_m[colnames_m %notin% colnames_tmp] + suffix <- if (length(omitted_samples) > 1) "s" else "" + + f_omissions <- + function(is_corr) { + cat( + sprintf( + "Below is the %s plot for %s substrates", + if (is_corr) "correlation" else "covariance", + sprintf( + if (top_substrates) + "%0.0f \"highest-quality\"" + else + "%0.0f", + ncol(t_m) + ) + ) + ) + if (did_filter_cols) { + cat(sprintf(", omitting sample%s ", suffix)) + latex_collapsed_vector(", ", omitted_samples) + } + cat(".\n\n") + } + + if (is.null(my_corcov) || sum(!is.na(t_m)) < 2) { + cat( + sprintf( + "\\newline\n%s %s plot.\n", + "Insufficient covariance to produce", + if (my_correlate_substrates) + "correlation" + else + "covariance" + ), + "\\newpage\n" + ) + return(NULL) + } + + cat("\\leavevmode\n", "\\newpage\n") + f_omissions(my_correlate_substrates) + + master_cex <- 0.4 + max_nchar <- max(nchar(rownames(t_m))) + my_limit <- g_intensity_hm_rows + diminution <- sqrt(my_limit / (my_limit + ncol(t_m))) + my_row_cex <- + my_col_cex <- + min(1.75, master_cex * 9 * diminution ^ 1.5) + my_margin <- 3 + my_row_cex * 64 / (8 + max_nchar) + my_key_cex <- 1.4 + my_hm2_cex <- 1.0 * master_cex + my_hgt_scale <- 3.50 - 0.26 * (max(0.4, my_key_cex) - 0.4) + my_hgt_scale <- 2.7 + + my_legend_height <- 4.0 - my_hgt_scale + my_legend_height <- 0.5 * my_key_cex + my_warp <- 0.65 * (my_limit + ncol(t_m)) / my_limit + my_warp <- 0.8 + my_legend_height <- 0.77 + my_legend_height <- 0.67 + my_plot_height <- my_hgt_scale + (1 - my_warp) * my_legend_height + my_legend_height <- my_warp * my_legend_height + + parjust <- par(adj = 0.5) + on.exit(par(parjust)) + my_corcov <- my_corcov[order(rownames(my_corcov)), ] + my_main <- + sprintf("%s among %s substrates %s", + if (my_correlate_substrates) "Correlation" + else "Covariance", + kinase_name, + if (!my_correlate_substrates && + g_filter_cov_var_gt_1 && + did_filter_rows + ) + "having variance > 1" + else "" + ) + my_main_nchar <- nchar(my_main) + my_heights <- c( + 0.3, + my_plot_height, + my_legend_height # was 4.0 - my_hgt_scale # was 4.19 + ) + if (print_trace_messages) cat(sprintf("max_nchar = %0.3f; ", max_nchar)) + if (print_trace_messages) cat(sprintf("my_margin = %0.3f; ", my_margin)) + if (print_trace_messages) cat(sprintf("my_plot_height = %0.3f\n\n", my_plot_height)) + if (print_trace_messages) cat(sprintf("master_cex = %0.3f; ", master_cex)) + if (print_trace_messages) cat(sprintf("my_row_cex = %0.3f; ", my_row_cex)) + if (print_trace_messages) cat(sprintf("my_col_cex = %0.3f; ", my_col_cex)) + if (print_trace_messages) cat(sprintf("my_key_cex = %0.3f\n\n", my_key_cex)) + if (print_trace_messages) cat(sprintf("my_hgt_scale = %0.3f\n\n", my_hgt_scale)) + if (print_trace_messages) cat(sprintf("legend height = %0.3f\n\n", my_legend_height)) + if (print_trace_messages) cat( + sprintf( + "my_heights = c(%s); sum = %0.3f\n\n", + paste( + sprintf("%0.3f", my_heights), + collapse = ", " + ), + sum(my_heights) + ) + ) + op <- par(cex.main = (30 + my_main_nchar) / my_main_nchar) + on.exit(par(op)) + hm2plus( + x = my_corcov, + cex = my_hm2_cex, + cexCol = my_col_cex, + cexRow = my_row_cex, + density_info = "none", + denhgt = 0.15, + denwid = 0.5, + divergent = TRUE, + key_par = list(cex = my_key_cex), + key_xlab = if (my_correlate_substrates) "Correlation" + else "Covariance", + main = my_main, + mat = matrix( + c( + c(0, 3, 3), + c(2, 1, 1), + c(0, 4, 0) + ), + nrow = 3, + ncol = 3, + byrow = TRUE + ), + heights = my_heights, + margins = c(my_margin, my_margin), + widths = c(0.5, 3.1, 0.9), + scale = "none", + symkey = TRUE, + symbreaks = TRUE, + symm = FALSE #TODO evaluate TRUE + # ... + ) + } # end cov_heatmap + +### FILE IMPORT + +# function reading bzipped file to data.frame +bzip2df <- function(d, f, ctor = bzfile) { + # read.delim file (by default, compressed by bzip2) + if (file.exists(f)) { + conn <- NULL + pf <- parent.frame() + tryCatch( + assign( + as.character(substitute(d)), + read.delim(conn <- bzfile(f, open = "r")), + pf + ), + finally = if (!is.null(conn)) close(conn) + ) + } +} + ``` -```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -cat("\\listoftables\n") -``` # Purpose -To perform for phosphopeptides: +The purpose of this analysis is to perform for phosphopeptides: - imputation of missing values, - quantile normalization, -- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and +- ANOVA (using the R stats::`r params$oneWayManyCategories` function), +- assignment of an FDR-adjusted $p$-value and a "quality score" to each phosphopeptide, and - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) - The Human Proteome Database [http://hprd.org](http://hprd.org) @@ -1151,13 +3096,15 @@ ```{r include = FALSE} -### GLOBAL VARIABLES - -# parameters for KSEA - -ksea_cutoff_statistic <- params$kseaCutoffStatistic -ksea_cutoff_threshold <- params$kseaCutoffThreshold -ksea_min_kinase_count <- params$kseaMinKinaseCount +if (params$kseaUseAbsoluteLog2FC) { + sfc <- "|s|" + pfc <- "|p|" + pfc_txt <- "$\\text{absolute value}({\\log_2 (\\text{fold-change})})$" +} else { + sfc <- "s" + pfc <- "p" + pfc_txt <- "${\\log_2 (\\text{fold-change}})$" +} ksea_heatmap_titles <- list() ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- @@ -1177,24 +3124,11 @@ # hash to hold names of significantly enriched kinases ksea_asterisk_hash <- new_env() -# READ PARAMETERS (mostly) - -intensity_hm_rows <- params$intensityHeatmapRows -# Input Filename -input_file <- params$inputFile - -# First data column - ideally, this could be detected via regexSampleNames, -# but for now leave it as is. -first_data_column <- params$firstDataColumn -fdc_is_integer <- is.integer(first_data_column) -if (fdc_is_integer) { - first_data_column <- as.integer(params$firstDataColumn) -} +# PROCESS (mostly read) PARAMETERS # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 -val_fdr <- - read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") +val_fdr <- read.table(file = alpha_file, sep = "\t", header = FALSE, quote = "") if ( ncol(val_fdr) != 1 || @@ -1202,62 +3136,17 @@ sum(val_fdr[, 1] < 0) || sum(val_fdr[, 1] > 1) ) { - stop("alphaFile should be one column of numbers within the range [0.0,1.0]") + cat("alphaFile should be one column of numbers within the range [0.0,1.0]") + param_df_exit() + knitr::knit_exit() } val_fdr <- val_fdr[, 1] -#Imputed Data filename -imputed_data_filename <- params$imputedDataFilename -imp_qn_lt_data_filenm <- params$imputedQNLTDataFile -anova_ksea_mtdt_file <- params$anovaKseaMetadata - ``` -```{r echo = FALSE} -# Imputation method, should be one of -# "random", "group-median", "median", or "mean" -imputation_method <- params$imputationMethod - -# Selection of percentile of logvalue data to set the mean for random number -# generation when using random imputation -mean_percentile <- params$meanPercentile / 100.0 - -# deviation adjustment-factor for random values; real number. -sd_percentile <- params$sdPercentile - -# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" -regex_sample_names <- params$regexSampleNames - -# Regular expression to extract Sample Grouping from Sample Name; -# if error occurs, compare sample_treatment_levels vs. sample_name_matches -# to see if groupings/pairs line up -# e.g., "(\\d+)" -regex_sample_grouping <- params$regexSampleGrouping - -one_way_all_categories_fname <- params$oneWayManyCategories -one_way_all_categories <- try_catch_w_e( - match.fun(one_way_all_categories_fname)) -if (!is.function(one_way_all_categories$value)) { - write("fatal error for parameter oneWayManyCategories:", stderr()) - write(one_way_all_categories$value$message, stderr()) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") -} -one_way_all_categories <- one_way_all_categories$value - -one_way_two_categories_fname <- params$oneWayManyCategories -one_way_two_categories <- try_catch_w_e( - match.fun(one_way_two_categories_fname)) -if (!is.function(one_way_two_categories$value)) { - cat("fatal error for parameter oneWayTwoCategories: \n") - cat(one_way_two_categories$value$message, fill = TRUE) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") -} -one_way_two_categories <- one_way_two_categories$value - -preproc_db <- params$preprocDb -ksea_app_prep_db <- params$kseaAppPrepDb +```{r echo = FALSE, results = 'asis'} + + result <- file.copy( from = preproc_db, to = ksea_app_prep_db, @@ -1272,9 +3161,19 @@ ), stderr() ) - if (sys.nframe() > 0) quit(save = "no", status = 1) - stop("Cannot continue. Goodbye.") + if (sys.nframe() > 0) { + cat("Cannot continue and quit() failed. Goodbye.") + param_df_exit() + knitr::knit_exit() + # in case knit_exit doesn't exit + quit(save = "no", status = 1) + } } + +if (FALSE) { + write.table(x = param_df, file = "test-data/params.txt") +} + ``` ```{r echo = FALSE} @@ -1289,55 +3188,75 @@ quote = "", check.names = FALSE ) + ``` -# Extract Sample Names and Treatment Levels - -Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. + +# Extraction of Sample Classes and Names from Input Data ```{r echo = FALSE, results = 'asis'} data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) +my_column_names <- names(full_data) if (!fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { - stop(paste("failed to convert firstDataColumn:", first_data_column)) + cat(paste("failed to convert firstDataColumn:", first_data_column)) + param_df_exit() + knitr::knit_exit() } } cat( sprintf( paste( - "\n\nThe input data file has peptide-intensity data for each sample", - "in one of columns %d through %d.\n\n" + "\n\nThe input data file has peptide-intensity data", + "in columns %d (\"%s\") through %d (\"%s\")." ), - min(data_column_indices), - max(data_column_indices) + tmp <- min(data_column_indices), + my_column_names[tmp], + tmp <- max(data_column_indices), + my_column_names[tmp] ) ) -# Write column names as a LaTeX enumerated list. -column_name_df <- data.frame( - column = seq_len(length(colnames(full_data))), - name = paste0("\\verb@", colnames(full_data), "@") +if (TRUE) { + cat0( + table_offset(i = 1, new = TRUE), + "Sample classes and names are shown in ", + table_href(), + ".\n\n" ) -data_frame_latex( - x = column_name_df, - justification = "l l", - centered = TRUE, - caption = "Input data column names", - anchor = const_table_anchor_bp, - underscore_whack = FALSE +} else { + cat0( + "\\newcounter{offset}\n", + "\\setcounter{offset}{\\value{table}}\n", + "\\stepcounter{offset}\n", + "Sample classes and names are shown in ", + table_href(), + ".\n\n" ) +} + +#TODO remove this unused variable and assignment +if (FALSE) { + # Write column names as a LaTeX enumerated list. + column_name_df <- data.frame( + column = seq_len(length(colnames(full_data))), + name = paste0("\\verb@", colnames(full_data), "@") + ) +} ``` ```{r echo = FALSE, results = 'asis'} +# extract intensity columns from full_data to quant_data quant_data <- full_data[first_data_column:length(full_data)] quant_data[quant_data == 0] <- NA rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide +full_data_names <- colnames(quant_data) # Extract factors and trt-replicates using regular expressions. # Typically: # regex_sample_names is "\\.\\d+[A-Z]$" @@ -1355,20 +3274,133 @@ write_debug_file(quant_data) rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) -sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) -number_of_samples <- length(sample_name_matches) +smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) + +if (print_nb_messages) nbe(see_variable(smpl_trt, "\n\n")) +if (print_nb_messages) nbe(see_variable(sample_name_matches, "\n\n")) +if (print_nb_messages) nbe(see_variable(full_data_names, "\n\n")) + +sample_treatment_df <- + save_sample_treatment_df <- + data.frame( + class = smpl_trt, + sample = sample_name_matches, + full_sample_names = full_data_names + ) + +if (print_nb_messages) nbe(see_variable(sample_treatment_df, "\n\n")) + +# reorder data +my_order <- with(sample_treatment_df, order(class, sample)) +quant_data <- quant_data[, my_order] +sample_name_matches <- sample_name_matches[my_order] +smpl_trt <- smpl_trt[my_order] sample_treatment_df <- data.frame( - level = sample_treatment_levels, + class = smpl_trt, sample = sample_name_matches ) -data_frame_latex( + +# filter smpl_trt as appropriate +if (sample_group_filter %in% c("include", "exclude")) { + include_sample <- + mgrepl( + v = sample_group_filter_patterns, + s = as.character(smpl_trt), + fixed = sample_group_filter_fixed, + perl = sample_group_filter_perl, + ignore.case = sample_group_filter_nocase + ) + if (sum(include_sample) < 2) { + errmsg <- + paste( + "ERROR:", + sum(include_sample), + "samples are too few for analysis;", + "check input parameters for sample-name parsing" + ) + cat0( + errmsg, + "\\stepcounter{offset}\n", + " in ", + table_href(), + ".\n\n" + ) + data_frame_tabbing_latex( + x = save_sample_treatment_df, + tabstops = c(1.25, 1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE + ) + data_frame_tabbing_latex( + x = + param_df[ + c("regexSampleNames", + "regexSampleGrouping", + "groupFilterPatterns", + "groupFilter", + "groupFilterMode" + ), + ], + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters for sample-name parsing", + verbatim = FALSE + ) + param_df_exit() + knitr::knit_exit() + return(invisible(-1)) + } + sample_treatment_df <- + if (sample_group_filter == "include") + sample_treatment_df[include_sample, ] + else + sample_treatment_df[!include_sample, ] +} else { + include_sample <- rep.int(TRUE, length(smpl_trt)) +} +sample_name_matches <- sample_treatment_df$sample +rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) +smpl_trt <- as.factor(regmatches(sample_name_matches, rx_match)) +sample_treatment_df$class <- smpl_trt + +# filter quant_data as appropriate +number_of_samples <- length(sample_name_matches) +quant_data <- quant_data[, sample_name_matches] + +sample_level_integers <- as.integer(smpl_trt) +sample_treatment_levels <- levels(smpl_trt) +count_of_treatment_levels <- length(sample_treatment_levels) + +# for each phosphopeptide, across treatment levels, compute minimum +# count of observed (i.e., non-missing) values +my_env <- new_env() +for (l in sample_treatment_levels) + my_env[[as.character(l)]] <- + as.vector(rowSums(!is.na(quant_data[l == smpl_trt]))) +min_group_obs_count <- row_apply( + x = Reduce( + f = function(l, r) cbind(l, my_env[[r]]), + x = sample_treatment_levels, + init = c() + ), + fun = min + ) +names(min_group_obs_count) <- rownames(quant_data) +rm(my_env) + +# display (possibly-filtered) results +cat("\\newpage\n") + +if (sum(include_sample) > 1) { +data_frame_tabbing_latex( x = sample_treatment_df, - justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", - centered = TRUE, - caption = "Treatment levels", - anchor = const_table_anchor_tbp, - underscore_whack = FALSE + tabstops = c(1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE ) +} +sample_name_grow <- 10 / (10 + max(nchar(sample_name_matches) + 6)) +sample_colsep <- transition_positions(as.integer(sample_treatment_df$class)) ``` ```{r echo = FALSE, results = 'asis'} @@ -1377,7 +3409,7 @@ ## Are the log-transformed sample distributions similar? -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} quant_data[quant_data == 0] <- NA #replace 0 with NA quant_data_log <- log10(quant_data) @@ -1387,25 +3419,81 @@ write_debug_file(quant_data_log) -# data visualization -old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) -) -# ref: https://r-charts.com/distribution/add-points-boxplot/ -# Vertical plot -boxplot( - quant_data_log -, las = 1 -, col = const_boxplot_fill -, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") -, xlab = "Sample" -) -par(old_par) - - - -cat("\n\n\n") -cat("\n\n\n") +g_ppep_distrib_ctl <- new_env() +g_ppep_distrib_ctl$xlab_line <- 3.5 + 11.86 * (0.67 - sample_name_grow) +g_ppep_distrib_ctl$mai_bottom <- (0.5 + 3.95 * (0.67 - sample_name_grow)) +g_ppep_distrib_ctl$axis <- (0.6 + 0.925 * (0.67 - sample_name_grow)) + +my_ppep_distrib_bxp <- function( + x + , sample_name_grow = sample_name_grow + , main + , varwidth = FALSE + , sub = NULL + , xlab + , ylab + , col = const_boxplot_fill + , notch = FALSE + , ppep_distrib_ctl = g_ppep_distrib_ctl + , ... + ) { + my_xlab_line <- g_ppep_distrib_ctl$xlab_line + my_mai_bottom <- g_ppep_distrib_ctl$mai_bottom + my_axis <- g_ppep_distrib_ctl$axis + + if (print_trace_messages) { + cat_variable(my_xlab_line, suffix = "; ") + cat_variable(my_mai_bottom, suffix = "; ") + cat_variable(my_axis, suffix = "\n\n") + } + + old_par <- par( + mai = par("mai") + c(my_mai_bottom, 0, 0, 0), + cex.axis = my_axis, + cex.lab = 1.2 + ) + tryCatch( + { + # Vertical plot + boxplot( + x + , las = 2 + , col = col + , main = main + , sub = NULL + , ylab = ylab + , xlab = NULL + , notch = notch + , varwidth = varwidth + , ... + ) + title( + sub = sub + , cex.sub = 1.0 + , line = my_xlab_line + 1 + ) + title( + xlab = xlab + , line = my_xlab_line + ) + }, + finally = par(old_par) + ) + } + +my_ppep_distrib_bxp( + x = quant_data_log + , sample_name_grow = sample_name_grow + , main = "Peptide intensities for each sample" + , varwidth = boxplot_varwidth + , sub = "Box widths reflect number of peptides for sample" + , xlab = "Sample" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = FALSE + ) + +cat("\n\n\n\n") ``` @@ -1454,6 +3542,8 @@ ) ``` +# Characterization of Input Data + ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} @@ -1461,7 +3551,7 @@ q1 <- quantile(logvalues, probs = mean_percentile)[1] # 1 = row of matrix (ie, phosphopeptide) -sds <- apply(quant_data_log, 1, sd_finite) +sds <- row_apply(quant_data_log, sd_finite) if (sum(!is.na(sds)) > 2) { plot( density(sds, na.rm = TRUE) @@ -1495,7 +3585,7 @@ # prep for trt-median based imputation ``` -# Impute Missing Values +# Imputation of Missing Values ```{r echo = FALSE} @@ -1526,11 +3616,10 @@ paste("Substitute missing value with", "median peptide-intensity for sample group.\n" ) - sample_level_integers <- as.integer(sample_treatment_levels) # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) - for (i in seq_len(length(levels(sample_treatment_levels)))) { + for (i in seq_len(count_of_treatment_levels)) { # Determine the columns for this factor-level level_cols <- i == sample_level_integers # Extract those columns @@ -1541,7 +3630,7 @@ # a given ppep has no measurement; otherwise, proceed. if (ncol(lvlsbst) > 1) { the_centers <- - apply(lvlsbst, 1, median, na.rm = TRUE) + row_apply(lvlsbst, median, na.rm = TRUE) for (j in seq_len(nrow(lvlsbst))) { for (k in seq_len(ncol(lvlsbst))) { if (is.na(lvlsbst[j, k])) { @@ -1565,7 +3654,7 @@ # Take the accurate ln(x+1) because the data are log-normally distributed # and because median can involve an average of two measurements. quant_data_imp <- log1p(quant_data_imp) - quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] + quant_data_imp[ind] <- row_apply(quant_data_imp, median, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) @@ -1581,7 +3670,7 @@ # this will have to be quant_data_imp <- log1p(quant_data_imp) # Assign to NA cells the mean for the row - quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] + quant_data_imp[ind] <- row_apply(quant_data_imp, mean, na.rm = TRUE)[ind[, 1]] # Take the accurate e^x - 1 to match scaling of original input. quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) good_rows <- !is.nan(rowMeans(quant_data_imp)) @@ -1625,11 +3714,40 @@ imp_smry_pot_peptides_after <- sum(good_rows) imp_smry_rejected_after <- sum(!good_rows) imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) + +# From ?`%in%`, %in% is currently defined as function(x, table) match(x, table, nomatch = 0) > 0 + +sink(stderr()) +print("`%in%`:") +print(`%in%`) +sink() + +stock_in <- + names(good_rows) %in% + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) +if (print_nb_messages) nbe(see_variable(stock_in), "\n") + +explicit_in <- + 0 < match( + names(good_rows), + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) + ) +if (print_nb_messages) nbe(see_variable(explicit_in), "\n") + +great_enough_row_names <- good_rows[ + names(good_rows) %in% + names(min_group_obs_count[g_intensity_min_per_class <= min_group_obs_count]) +] +if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") +great_enough_row_names <- great_enough_row_names[great_enough_row_names] +if (print_nb_messages) nbe(see_variable(great_enough_row_names), "\n") ``` + ```{r echo = FALSE, results = 'asis'} # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf tabular_lines_fmt <- paste( "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) + " \\leavevmode", " \\caption{Imputation Results}", " \\centering", # \centering centers the table on the page " \\begin{tabular}{l c c c}", @@ -1637,8 +3755,9 @@ " \\ & potential peptides & missing values & rejected", " peptides \\\\ [0.5ex]", " \\hline", - " before imputation & %d & %d (%d\\%s) & \\\\", - " after imputation & %d & %d & %d \\\\ [1ex]", + " before imputation & %d & %d (%d\\%s) & \\\\", + " after imputation & %d & %d & %d \\\\", + " after keep comparable & %d & & %d \\\\ [1ex]", " \\hline", " \\end{tabular}", #" \\label{table:nonlin}", # may be used to refer this table in the text @@ -1654,18 +3773,37 @@ "%", imp_smry_pot_peptides_after, imp_smry_missing_values_after, - imp_smry_rejected_after + imp_smry_rejected_after, + length(great_enough_row_names), + imp_smry_pot_peptides_before - + length(great_enough_row_names) ) cat(tabular_lines) ``` -```{r echo = FALSE} - - -# Zap rows where imputation was ineffective + +```{r filter_good_rows, echo = FALSE} + +if (print_nb_messages) nbe("before name extraction, ", see_variable(length(good_rows)), " ", see_variable(good_rows), "\n") +good_rows <- names(good_rows[names(great_enough_row_names)]) +if (print_nb_messages) nbe("after name extraction, ", see_variable(length(good_rows)), see_variable(good_rows), "\n") + +#ACE min_group_obs_count <- min_group_obs_count[names(great_enough_row_names)] +#ACE nbe("good_rows") +#ACE nbe(see_variable(good_rows)) +#ACE nbe("names(min_group_obs_count) before filter for good rows") +#ACE nbe(see_variable(names(min_group_obs_count))) +min_group_obs_count <- min_group_obs_count[good_rows] +#ACE nbe("min_group_obs_count after filter for good rows") +#ACE nbe(see_variable(names(min_group_obs_count))) + +# Zap rows where imputation was insufficiently effective full_data <- full_data [good_rows, ] quant_data <- quant_data [good_rows, ] - +quant_data_log <- quant_data_log [good_rows, ] + +if (print_nb_messages) nbe("before row filter, ", see_variable(nrow(quant_data_imp)), "\n") quant_data_imp <- quant_data_imp[good_rows, ] +if (print_nb_messages) nbe("after row filter, ", see_variable(nrow(quant_data_imp)), "\n") write_debug_file(quant_data_imp) quant_data_imp_good_rows <- quant_data_imp @@ -1719,35 +3857,41 @@ ``` -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} zero_sd_rownames <- rownames(quant_data_imp)[ - is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) + is.na((row_apply(quant_data_imp, sd, na.rm = TRUE)) == 0) ] if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { - stop("All peptides have zero standard deviation. Cannot continue.") + cat("All peptides have zero standard deviation. Cannot continue.") + param_df_exit() + knitr::knit_exit() } if (length(zero_sd_rownames) > 0) { cat( - sprintf("%d peptides with zero variance were removed from statistical consideration", - length(zero_sd_rownames) + sprintf( + "%d %s %s", + length(zero_sd_rownames), + "peptides with zero variance", + "were removed from statistical consideration" ) ) zap_named_rows <- function(df, nms) { return(df[!(row.names(df) %in% nms), ]) } - quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) - quant_data <- zap_named_rows(quant_data, zero_sd_rownames) - full_data <- zap_named_rows(full_data, zero_sd_rownames) + quant_data_imp <- + zap_named_rows(quant_data_imp, zero_sd_rownames) + quant_data <- + zap_named_rows(quant_data, zero_sd_rownames) + full_data <- + zap_named_rows(full_data, zero_sd_rownames) + min_group_obs_count <- + min_group_obs_count[names(min_group_obs_count) %notin% zero_sd_rownames] } if (sum(is.na(quant_data)) > 0) { cat("\\leavevmode\\newpage\n") - # data visualization - old_par <- par( - mai = par("mai") + c(0.5, 0, 0, 0) - ) # Copy quant data to x x <- quant_data # x gets to have values of: @@ -1776,22 +3920,24 @@ show_stripchart <- 50 > (count_red + count_blue) / length(sample_name_matches) if (show_stripchart) { - boxplot_sub <- "Light blue = data before imputation; Red = imputed data" + boxplot_sub <- "Light blue = data before imputation; Red = imputed data;" } else { boxplot_sub <- "" } # Vertical plot colnames(blue_dots) <- sample_name_matches - boxplot( - blue_dots - , las = 1 # "always horizontal" - , col = const_boxplot_fill - , ylim = ylim + my_ppep_distrib_bxp( + x = blue_dots + , sample_name_grow = sample_name_grow , main = "Peptide intensities after eliminating unusable peptides" - , sub = boxplot_sub + , varwidth = boxplot_varwidth + , sub = paste(boxplot_sub, "Box widths reflect number of peptides for sample") , xlab = "Sample" , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = FALSE + , ylim = ylim ) if (show_stripchart) { @@ -1803,7 +3949,7 @@ method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced + cex = const_stripchart_cex, # Size of symbols reduced col = "lightblue", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over @@ -1813,49 +3959,71 @@ method = "jitter", # Random noise jitter = const_stripchart_jitter, pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced + cex = const_stripchart_cex, # Size of symbols reduced col = "red", # Color of the symbol vertical = TRUE, # Vertical mode add = TRUE # Add it over ) } - if (TRUE) { - # show measured values in blue on left half-violin plot - cat("\\leavevmode\n\\quad\n\n\\quad\n\n") - vioplot::vioplot( - x = lapply(blue_dots, function(x) x[!is.na(x)]), - col = "lightblue1", - side = "left", - plotCentre = "line", - ylim = ylim_save, - main = "Distributions of observed and imputed data", - sub = "Light blue = observed data; Pink = imputed data", - xlab = "Sample", - ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") +} +``` + +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +if (sum(is.na(quant_data)) > 0) { + # show measured values in blue on left half-violin plot + cat("\\leavevmode\n\\quad\n\n\\quad\n\n") + old_par <- par( + mai = par("mai") + c(g_ppep_distrib_ctl$mai_bottom, 0, 0, 0), + cex.axis = g_ppep_distrib_ctl$axis, + cex.lab = 1.2 + ) + tryCatch( + { + vioplot::vioplot( + x = lapply(blue_dots, function(x) x[!is.na(x)]), + col = "lightblue1", + side = "left", + plotCentre = "line", + ylim = ylim_save, + main = "Distributions of observed and imputed data", + sub = NULL, + las = 2, + xlab = NULL, + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ) + title( + sub = "Light blue = observed data; Pink = imputed data", + cex.sub = 1.0, + line = g_ppep_distrib_ctl$xlab_line + 1 ) - red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) - cols_to_delete <- c() - for (ix in seq_len(length(red_violins))) { - if (length(red_violins[[ix]]) < 1) { - cols_to_delete <- c(cols_to_delete, ix) + title( + xlab = "Sample", + line = g_ppep_distrib_ctl$xlab_line + ) + red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) + cols_to_delete <- c() + for (ix in seq_len(length(red_violins))) { + if (length(red_violins[[ix]]) < 1) { + cols_to_delete <- c(cols_to_delete, ix) + } } - } - # destroy any unimputable columns - if (!is.null(cols_to_delete)) { - red_violins <- red_violins[-cols_to_delete] - } - # plot imputed values in red on right half-violin plot - vioplot::vioplot( - x = red_violins, - col = "lightpink1", - side = "right", - plotCentre = "line", - add = TRUE - ) - } - - par(old_par) + # destroy any unimputable columns + if (!is.null(cols_to_delete)) { + red_violins <- red_violins[-cols_to_delete] + } + # plot imputed values in red on right half-violin plot + vioplot::vioplot( + x = red_violins, + col = "lightpink1", + side = "right", + plotCentre = "line", + add = TRUE + ) + + }, + finally = par(old_par) + ) # density plot cat("\\leavevmode\n\n\n\n\n\n\n") @@ -1893,88 +4061,88 @@ } ``` -# Perform Quantile Normalization +# Quantile Normalization The excellent `normalize.quantiles` function from *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* performs "quantile normalization" as described Bolstad *et al.* (2003), DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* -and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, +and its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html), i.e., it assumes that the goal is to detect subtle differences among grossly similar samples (having similar distributions) -by equailzing intra-quantile quantitations. -Unfortunately, one software library upon which it depends +by equalizing intra-quantile quantitations^[Unfortunately, +one software library upon which `preprocessCore` depends *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* -that requires that a specific, non-concurrent version of the library be +that requires that a specific, non-concurrent version of the library (`openblas` version $0.3.3$) be installed. The installation command equivalent to what was used to install the library to produce the results presented here is: -``` - conda install bioconductor-preprocesscore openblas=0.3.3 -``` +\linebreak +` conda install bioconductor-preprocesscore openblas=0.3.3`]. <!-- -# Apply quantile normalization using preprocessCore::normalize.quantiles -# --- -# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html -# except this: https://support.bioconductor.org/p/122925/#9135989 -# says to install it like this: -# ``` -# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) -# ``` -# conda installation (necessary because of a bug in recent openblas): -# conda install bioconductor-preprocesscore openblas=0.3.3 -# ... -# --- -# normalize.quantiles {preprocessCore} -- Quantile Normalization -# -# Description: -# Using a normalization based upon quantiles, this function normalizes a -# matrix of probe level intensities. -# -# THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the -# assumption that the data is missing at random. -# -# Usage: -# normalize.quantiles(x, copy = TRUE, keep.names = FALSE) -# -# Arguments: -# -# - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. -# -# - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, -# but in certain situations not making a copy of the matrix, but instead normalizing -# it in place will be more memory friendly. -# -# - keep.names: Boolean option to preserve matrix row and column names in output. -# -# Details: -# This method is based upon the concept of a quantile-quantile plot extended to n dimensions. -# No special allowances are made for outliers. If you make use of quantile normalization -# please cite Bolstad et al, Bioinformatics (2003). -# -# This functions will handle missing data (ie NA values), based on -# the assumption that the data is missing at random. -# -# Note that the current implementation optimizes for better memory usage -# at the cost of some additional run-time. -# -# Value: A normalized matrix. -# -# Author: Ben Bolstad, bmbolstad.com -# -# References -# -# - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide -# Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf -# -# - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of -# Normalization Methods for High Density Oligonucleotide Array Data Based on Bias -# and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 -# http://bmbolstad.com/misc/normalize/normalize.html -# ... + # Apply quantile normalization using preprocessCore::normalize.quantiles + # --- + # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html + # except this: https://support.bioconductor.org/p/122925/#9135989 + # says to install it like this: + # ``` + # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) + # ``` + # conda installation (necessary because of a bug in recent openblas): + # conda install bioconductor-preprocesscore openblas=0.3.3 + # ... + # --- + # normalize.quantiles {preprocessCore} -- Quantile Normalization + # + # Description: + # Using a normalization based upon quantiles, this function normalizes a + # matrix of probe level intensities. + # + # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the + # assumption that the data is missing at random. + # + # Usage: + # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) + # + # Arguments: + # + # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. + # + # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, + # but in certain situations not making a copy of the matrix, but instead normalizing + # it in place will be more memory friendly. + # + # - keep.names: Boolean option to preserve matrix row and column names in output. + # + # Details: + # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. + # No special allowances are made for outliers. If you make use of quantile normalization + # please cite Bolstad et al, Bioinformatics (2003). + # + # This functions will handle missing data (ie NA values), based on + # the assumption that the data is missing at random. + # + # Note that the current implementation optimizes for better memory usage + # at the cost of some additional run-time. + # + # Value: A normalized matrix. + # + # Author: Ben Bolstad, bmbolstad.com + # + # References + # + # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide + # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf + # + # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of + # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias + # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 + # http://bmbolstad.com/misc/normalize/normalize.html + # ... --> ```{r echo = FALSE, results = 'asis'} +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp)), "\n") if (nrow(quant_data_imp) > 0) { quant_data_imp_qn <- preprocessCore::normalize.quantiles( as.matrix(quant_data_imp), keep.names = TRUE @@ -1983,17 +4151,20 @@ quant_data_imp_qn <- as.matrix(quant_data_imp) } +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn)), "\n") + quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) - write_debug_file(quant_data_imp_qn) quant_data_imp_qn_log <- log10(quant_data_imp_qn) - write_debug_file(quant_data_imp_qn_log) +if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") +if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") + quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) -sel <- apply(quant_data_imp_qn_ls, 1, any_nan) +sel <- row_apply(quant_data_imp_qn_ls, any_nan) quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] @@ -2008,10 +4179,9 @@ data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) ``` -<!-- ACE insertion begin --> ## Are normalized, imputed, log-transformed sample distributions similar? -```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +```{r echo = FALSE, fig.dim = c(9, 6.5), results = 'asis'} # Save unimputed quant_data_log for plotting below unimputed_quant_data_log <- quant_data_log @@ -2034,6 +4204,22 @@ # data visualization + if (TRUE) { + + my_ppep_distrib_bxp( + x = quant_data_log + , sample_name_grow = sample_name_grow + , main = "Peptide intensities for each sample" + , varwidth = boxplot_varwidth + , sub = NULL + , xlab = "Sample" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , col = const_boxplot_fill + , notch = boxplot_notch + ) + + } else { + old_par <- par( mai = par("mai") + c(0.5, 0, 0, 0) , oma = par("oma") + c(0.5, 0, 0, 0) @@ -2043,12 +4229,16 @@ colnames(quant_data_log) <- sample_name_matches boxplot( quant_data_log - , las = 1 + , las = 2 + , cex.axis = 0.9 * sample_name_grow , col = const_boxplot_fill , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") , xlab = "Sample" + , notch = boxplot_notch + , varwidth = boxplot_varwidth ) par(old_par) + } } else { cat("There are no peptides to plot\n") } @@ -2074,7 +4264,28 @@ # ANOVA Analysis -```{r, echo = FALSE} +## Assignment of $p$-value and quality score + +For each phosphopeptide, ANOVA analysis was performed to compute a $p$-value representing the evidence against the "null hypothesis" ($H_0$) that the intensity does not vary significantly among sample groups. + +However, because as more and more phosphopeptides are tested, there is increasd probability that, by random chance, a given peptide will have a $p$-value that appears to indicate significance. For this reason, the $p$-values were adjusted by applying the False Discovery Rate (FDR) correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x). + +Furthermore, to give more weight to phosphopeptides having fewer missing values, an (arbitrarily defined) quality score was assigned to each, defined as: + +$$ +\textit{quality}_j + = \frac{1 + o_{j}}{v_{j}(1 + o_{j}) + 0.005} +$$ + +where: + +- $o_j$ is the minimum number of non-missing observations per sample group for substrate $j$ for all sample groups, and +- $v_j$ is the FDR-adjusted ANOVA $p$-value for substrate $j$. + + +```{r, echo = FALSE, results = 'asis'} +cat("\\newpage\n") + # Make new data frame containing only Phosphopeptides # to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( @@ -2084,16 +4295,10 @@ colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` -```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -count_of_treatment_levels <- length(levels(sample_treatment_levels)) +```{r anova, echo = FALSE, fig.dim = c(10, 12), results = 'asis'} +g_can_run_ksea <- FALSE +old_oma <- par("oma") if (count_of_treatment_levels < 2) { - nuke_control_sequences <- - function(s) { - s <- gsub("[\\]", "xyzzy_plugh", s) - s <- gsub("[$]", "\\\\$", s) - s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) - return(s) - } cat( "ERROR!!!! Cannot perform ANOVA analysis", "(see next page)\\newpage\n" @@ -2109,7 +4314,7 @@ paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), "\n\\end{quote}\n\n") - regex_sample_names <- nuke_control_sequences(regex_sample_names) + regex_sample_names <- latex_printable_control_seqs(regex_sample_names) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleNames is", @@ -2125,7 +4330,7 @@ paste(sample_name_matches, collapse = "\n\n\n"), "\n\\end{quote}\n\n") - regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) + regex_sample_grouping <- latex_printable_control_seqs(regex_sample_grouping) cat("\\leavevmode\n\n\n") cat("Parsing rule for SampleGrouping is", @@ -2144,31 +4349,90 @@ } else { - p_value_data_anova_ps <- - apply( - quant_data_imp_qn_log, - 1, - anova_func, - grouping_factor = sample_treatment_levels, - one_way_f = one_way_all_categories - ) + if (print_nb_messages) nbe("computing p_value_data_anova_ps\n") + if (print_nb_messages) nbe(see_variable(nrow(quant_data_imp_qn_log)), "\n") + if (print_nb_messages) nbe(see_variable(ncol(quant_data_imp_qn_log)), "\n") + if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log[, ".NE.7C"]), "\n") + if (print_nb_messages) nbe(see_variable(quant_data_imp_qn_log), "\n") + if (print_nb_messages) nbe(see_variable(anova_func), "\n") + if (print_nb_messages) nbe(see_variable(smpl_trt), "\n") + if (print_nb_messages) nbe(see_variable(one_way_all_categories), "\n") + tryCatch( + { + p_value_data_anova_ps <- + row_apply( + quant_data_imp_qn_log, + anova_func, + grouping_factor = smpl_trt, + one_way_f = one_way_all_categories + ) + }, + error = function(e) { + mesg <- paste("Could not compute ANOVA because", e$message) + cat("\n\n", mesg, "\n\n") + param_df_noexit(e) + sink(stderr()) + cat("\n\n", mesg, "\n\n") + values <- paste(param_df$parameter, "=", param_df$value, collapse = "\n") + cat(values) + sink() + knitr::knit_exit() + exit(code = 1) + } + ) + if (print_nb_messages) nbe(see_variable(p_value_data_anova_ps), "\n") p_value_data_anova_ps_fdr <- p.adjust(p_value_data_anova_ps, method = "fdr") + my_ppep_v <- full_data[, 1] + p_value_data <- list( + phosphopeptide = my_ppep_v, + raw_anova_p = p_value_data_anova_ps, + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, + missing_values = rowSums(is.na(quant_data)), + min_group_obs_count = min_group_obs_count + ) p_value_data <- data.frame( - phosphopeptide = full_data[, 1] - , - raw_anova_p = p_value_data_anova_ps - , - fdr_adjusted_anova_p = p_value_data_anova_ps_fdr + phosphopeptide = my_ppep_v, + raw_anova_p = p_value_data_anova_ps, + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr, + missing_values = rowSums(is.na(quant_data)), + min_group_obs_count = min_group_obs_count ) + p_value_data$quality <- 1.0 / with( + p_value_data, + fdr_adjusted_anova_p + 0.005 / (1 + min_group_obs_count) + ) + + p_value_data$ranking <- + with( + p_value_data, + switch( + g_intensity_hm_criteria, + "quality" = order(-quality), + "na_count" = order(missing_values, fdr_adjusted_anova_p), + # the default is "p_value" + order(fdr_adjusted_anova_p) + ) + ) + p_value_data <- p_value_data[p_value_data$ranking, , drop = FALSE] + + write.table( + p_value_data, + file = "p_value_data.txt", + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" - # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" + # becomes "Outputfile_pST_ANOVA_STEP5_FDR0.05.txt" # Re-output datasets to include p-values - metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) + metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:ncol(p_value_data)]) write.table( cbind(metadata_plus_p, quant_data_imp), file = imputed_data_filename, @@ -2188,61 +4452,97 @@ ) - p_value_data <- - p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] - first_page_suppress <- 1 number_of_peptides_found <- 0 cutoff <- val_fdr[1] for (cutoff in val_fdr) { - if (number_of_peptides_found > 49) { + #loop through FDR cutoffs + if (number_of_peptides_found > g_intensity_hm_rows - 1) { cat("\\leavevmode\n\n\n") break } - #loop through FDR cutoffs + bool_1 <- (p_value_data$fdr_adjusted_anova_p < cutoff) + bool_2 <- (p_value_data$min_group_obs_count >= g_intensity_min_per_class) + g_can_run_ksea <- g_can_run_ksea || (sum(bool_2) > 0) + bool_4 <- (p_value_data$quality >= params$minQuality) + bool_3 <- as.logical( + as.integer(bool_1) * + as.integer(bool_2) * + as.integer(bool_4) + ) + if (print_trace_messages) { + if (length(bool_1) > 30) { + cat_variable(bool_1, force_str = TRUE) + cat_variable(bool_2, force_str = TRUE) + cat_variable(bool_3, force_str = TRUE) + } else { + cat_variable(bool_1, suffix = "\n\n") + cat_variable(bool_2, suffix = "\n\n") + cat_variable(bool_3, suffix = "\n\n") + } + cat_variable(length(bool_3), digits = 0, suffix = "; ") + cat_variable(sum(bool_3), digits = 0, suffix = "\n\n") + } filtered_p <- - p_value_data[ - which(p_value_data$fdr_adjusted_anova_p < cutoff), - , drop = FALSE - ] + p_value_data[bool_3, , drop = FALSE] + filtered_p <- + filtered_p[!is.na(filtered_p$phosphopeptide), , drop = FALSE] + + if (print_trace_messages) + cat_variable(filtered_p, force_str = TRUE) + + have_remaining_peptides <- sum(bool_3, na.rm = TRUE) > 0 + filter_result_string <- + sprintf( + "%s, %s of %0.0f peptides remained having both %s and %s.\n\n", + "After filtering for ANOVA results", + if (have_remaining_peptides) + as.character(sum(bool_3, na.rm = TRUE)) + else + "none", + length(bool_3), + sprintf("adjusted p-value < %s", as.character(signif(cutoff, 2))), + sprintf( + "more than %0.0f observations in some groups", + max(0, g_intensity_min_per_class - 1) + ) + ) + filtered_data_filtered <- quant_data_imp_qn_log[ rownames(filtered_p), , drop = FALSE ] + # order by p-value filtered_data_filtered <- filtered_data_filtered[ order(filtered_p$fdr_adjusted_anova_p), , drop = FALSE ] - # <!-- ACE insertion start --> - - if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { + if (have_remaining_peptides && nrow(filtered_p) > 0 && nrow(filtered_data_filtered) > 0) { if (first_page_suppress == 1) { first_page_suppress <- 0 } else { cat("\\newpage\n") } - if (nrow(filtered_data_filtered) > 1) { - subsection_header(sprintf( - "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", - nrow(filtered_data_filtered), - cutoff - )) - } else { - subsection_header(sprintf( - "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", - rownames(filtered_data_filtered)[1], - cutoff - )) - } - cat("\n\n\n") - cat("\n\n\n") - - old_oma <- par("oma") + latex_samepage({ + cat(filter_result_string) + if (nrow(filtered_data_filtered) > 1) { + cat(subsection_header(sprintf( + "Intensity distributions for %d phosphopeptides\n", + nrow(filtered_data_filtered) + ))) + } else { + cat(subsection_header(sprintf( + "Intensity distribution for one phosphopeptide (%s)\n", + rownames(filtered_data_filtered)[1] + ))) + } + }) # end latex_samepage + old_par <- par( mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), oma = old_oma * c(1, 1, 0.3, 1), @@ -2250,18 +4550,24 @@ cex.axis = 0.7, fin = c(9, 7.25) ) - # ref: https://r-charts.com/distribution/add-points-boxplot/ # Vertical plot colnames(filtered_data_filtered) <- sample_name_matches tryCatch( boxplot( filtered_data_filtered, main = "Imputed, normalized intensities", # no line plot - las = 1, + las = 2, + cex.axis = 0.9 * sample_name_grow, col = const_boxplot_fill, - ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + notch = FALSE, + varwidth = boxplot_varwidth ), - error = function(e) print(e) + error = function(e) { + print(e) + cat_margins() + } + ) par(old_par) } else { @@ -2272,7 +4578,7 @@ )) } - if (nrow(filtered_data_filtered) > 0) { + if (have_remaining_peptides && nrow(filtered_data_filtered) > 0) { # Add Phosphopeptide column to anova_filtered table # The assumption here is that the first intensity is unique; # this is a hokey assumption but almost definitely will @@ -2305,37 +4611,26 @@ # Produce heatmap to visualize significance and the effect of imputation - anova_filtered_merge_format <- sapply( - X = filtered_p$fdr_adjusted_anova_p - , - FUN = function(x) { - if (x > 0.0001) - paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") - else - paste0("(%0.4e) %s") - } - ) - cat_hm_heading <- function(m, cutoff) { - cat("\\newpage\n") - if (nrow(m) > intensity_hm_rows) { - subsection_header( + if (nrow(m) > g_intensity_hm_rows) { + cat("\\clearpage\n") + cat(subsection_header( paste( sprintf("Heatmap for the %d most-significant peptides", - intensity_hm_rows), + g_intensity_hm_rows), sprintf("whose adjusted p-value < %0.2f\n", cutoff) ) - ) + )) } else { - if (nrow(m) == 1) { + if (nrow(m) == 0) { return(FALSE) } else { - subsection_header( + cat(subsection_header( paste( - sprintf("Heatmap for %d usable peptides whose", nrow(m)), + sprintf("Heatmap for %d usable peptide genes whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) - ) + )) } } cat("\n\n\n") @@ -2346,40 +4641,125 @@ # construct matrix with appropriate rownames m <- as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) - if (nrow(m) > 0) { + nrow_m <- nrow(m) + ncol_m <- ncol(m) + if (nrow_m > 0) { rownames_m <- rownames(m) - rownames(m) <- sapply( - X = seq_len(nrow(m)) - , + q <- data.frame(pepname = rownames_m) + g <- sqldf(" + SELECT q.pepname, substr(met.Gene_Name, 1, 30) AS gene_name + FROM q, metadata_plus_p AS met + WHERE q.pepname = met.Phosphopeptide + ORDER BY q.rowid + ") + tmp <- sapply( + X = seq_len(nrow(g)), + FUN = function(i) { + pre <- strsplit(g$gene_name[i], "; ")[[1]] + rslt <- paste(unique(pre), sep = "; ") + return(rslt) + } + ) + tmp <- unlist(tmp) + tmp <- + make.names(tmp, unique = TRUE) + tmp <- sub( + "No_Gene_Name", + "not_found", + tmp, + fixed = TRUE + ) + ten_trunc_names <- trunc_ppep(rownames_m) + tmp <- sapply( + X = seq_len(nrow_m), FUN = function(i) { sprintf( - anova_filtered_merge_format[i], - filtered_p$fdr_adjusted_anova_p[i], - rownames_m[i] + "(%s) %s", + tmp[i], + ten_trunc_names[i] ) } ) + rownames(m) <- tmp } # draw the heading and heatmap - if (nrow(m) > 0) { + if (nrow_m > 0) { number_of_peptides_found <- - draw_intensity_heatmap( + ppep_heatmap( m = m, cutoff = cutoff, hm_heading_function = cat_hm_heading, - hm_main_title = "Unimputed, unnormalized log(intensities)", - suppress_row_dendrogram = FALSE + hm_main_title = + "log(intensities), row-scaled, unimputed, unnormalized", + suppress_row_dendrogram = FALSE, + master_cex = 0.35, + sepcolor = "black", + colsep = sample_colsep ) + if (number_of_peptides_found > 1) { + cat("\\leavevmode\n") + } } } } } -cat("\\leavevmode\n\n\n") +cat(filter_result_string) +cat("\\leavevmode\n") + +if (!g_can_run_ksea) { + errmsg <- paste("Cannot proceed with KSEA analysis", + "because too many values are missing.") + if (FALSE) cat0( + errmsg, + "\\stepcounter{offset}\n", + "\\stepcounter{offset}\n", + "\\stepcounter{offset}\n", + " in ", + table_href(), + ".\n\n" + ) + if (FALSE) { + if (print_nb_messages) nbe(see_variable(p_value_data)) + } else { + if (print_nb_messages) nbe(see_variable(p_value_data)) + + display_p_value_data <- p_value_data + display_p_value_data$raw_anova_p <- + sprintf("%0.3g", display_p_value_data$raw_anova_p) + display_p_value_data$fdr_adjusted_anova_p <- + sprintf("%0.3g", display_p_value_data$fdr_adjusted_anova_p) + display_p_value_data$quality <- + sprintf("%0.3g", display_p_value_data$quality) + + headers_1st_line <- + c("", "Raw ANOVA", "FDR-adj.", "Missing", "Min. #", "", "") + headers_2nd_line <- + c("Phosphopeptide", "p-value", "p-value", "values", "group-obs", "Quality", "Ranking") + data_frame_tabbing_latex( + x = display_p_value_data, + tabstops = c(2.75, 0.80, 0.80, 0.5, 0.6, 0.60), + use_subsubsection_header = FALSE, + headings = c(headers_1st_line, headers_2nd_line), + caption = "ANOVA results" + + ) + } + data_frame_tabbing_latex( + x = save_sample_treatment_df, + tabstops = c(1.25, 1.25), + caption = "Sample classes", + use_subsubsection_header = FALSE + ) + param_df_exit() + knitr::knit_exit() + return(invisible(-1)) +} + ``` ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} -if (count_of_treatment_levels > 1) { +if (g_can_run_ksea && count_of_treatment_levels > 1) { # Prepare two-way contrasts with adjusted p-values # Strategy: # - use imputed, log-transformed data: @@ -2395,7 +4775,7 @@ # Each contrast is between a combination of trt levels m2 <- combn( - x = seq_len(length(levels(sample_treatment_levels))), + x = seq_len(length(levels(smpl_trt))), m = 2, simplify = TRUE ) @@ -2409,13 +4789,13 @@ return( data.frame( contrast = cntrst, - level = sample_treatment_levels[ - sample_treatment_levels %in% - levels(sample_treatment_levels)[c(lvl1, lvl2)] + level = smpl_trt[ + smpl_trt %in% + levels(smpl_trt)[c(lvl1, lvl2)] ], label = sample_name_matches[ - sample_treatment_levels %in% - levels(sample_treatment_levels)[c(lvl1, lvl2)] + smpl_trt %in% + levels(smpl_trt)[c(lvl1, lvl2)] ] ) ) @@ -2595,6 +4975,7 @@ ) # - create contrast-metadata table + if (print_nb_messages) nbe("CREATE TABLE contrast_lvl_lvl_metadata") dml_no_rows_exec(db, " CREATE TABLE contrast_lvl_lvl_metadata AS @@ -2705,12 +5086,11 @@ # - run the two-level (one-way) test p_value_data_contrast_ps <- - apply( - X = contrast_cast_data, - MARGIN = 1, # apply to rows - FUN = anova_func, + row_apply( + x = contrast_cast_data, + fun = anova_func, grouping_factor = - as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 + as.factor(grouping_factor$level), # anova_func arg2 one_way_f = one_way_two_categories, # anova_func arg3 simplify = TRUE # TRUE is the default for simplify ) @@ -2922,6 +5302,8 @@ ; " ) + # We are done with DDL and insertion + RSQLite::dbDisconnect(db) } ``` @@ -2929,7 +5311,7 @@ cat("\\newpage\n") ``` -# KSEA Analysis +# KSEA Analysis Summaries Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: @@ -2940,24 +5322,24 @@ For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: $$ -\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} +\text{kinase enrichment }z\text{-score}_{j,i} = \frac{(\overline{`r sfc`}_{j,i} - \overline{`r pfc`}_j)\sqrt{m_{j,i}}}{\delta_j} $$ and fold-enrichment is computed as: $$ -\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} +\text{Enrichment}_{j,i} = \frac{\overline{`r sfc`}_{j,i}}{\overline{`r pfc`}_j} $$ where: -- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, -- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and +- $\overline{`r sfc`}_{j,i}$ is the mean `r pfc_txt` in intensities of known substrates of the kinase $i$ in contrast $j$, +- $\overline{`r pfc`}_j$ is the mean `r pfc_txt` of all phosphosites identified in contrast $j$, and - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, -- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. +- $\delta_j$ is the standard deviation of the $\log_2 (\text{fold-change})$ for contrast $j$ across all phosphosites in the dataset. - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. -$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) +$\text{FDR}_{j,i}$ is the False Discovery Rate corrected kinase enrichment score. Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). @@ -2965,9 +5347,14 @@ - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). -- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). - -```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). + +For each enriched kinase, a heatmap showing the intensities is presented for up to `r g_intensity_hm_rows` substrates, i.e., those substrates having the highest"quality". + +Where possible, a heatmap of the correlations among these the selected substrates is also presented; if correlations cannot be computed (because of too many missing values), then the covariances are heatmapped for substrates having a variance greater than 1. + +```{r ksea, echo = FALSE, fig.dim = c(12, 14.5), results = 'asis'} +cat("\\clearpage\n") db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) @@ -3046,21 +5433,9 @@ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) contrast_longlabel <- ( sprintf( - "Trt %s {%s} -> Trt %s {%s}", + "Class %s -> Class %s", contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE - ) + contrast_metadata_df[i_cntrst, "a_level"] ) ) kseaapp_input <- @@ -3091,18 +5466,44 @@ expr = { ksea_scores_rslt <- ksea_scores( - ksdata = pseudo_ksdata, # KSEAapp::KSData, - px = kseaapp_input, - networkin = TRUE, - networkin_cutoff = 2 + ksdata = pseudo_ksdata, + px = kseaapp_input, + networkin = TRUE, + networkin_cutoff = 2, + minimum_substrate_count = ksea_min_substrate_count ) + if (FALSE) { + ksea_scores_rslt <- + ksea_scores_rslt[ + ksea_scores_rslt$m >= ksea_min_substrate_count, + , + drop = FALSE + ] + } + + if (FALSE) { + data_frame_tabbing_latex( + x = ksea_scores_rslt, + tabstops = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8), + caption = paste("KSEA scores for contrast ", + cntrst_b_level, "to", cntrst_a_level), + use_subsubsection_header = FALSE + ) + } + + if (FALSE) { + if (print_nb_messages) nbe("Output contents of `ksea_scores_rslt` table\n") + cat_variable(ksea_scores_rslt) + cat("\n\\clearpage\n") + } + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { next_index <- 1 + next_index rslt$score_list[[next_index]] <- ksea_scores_rslt rslt$name_list[[next_index]] <- contrast_label rslt$longname_list[[next_index]] <- contrast_longlabel - low_fdr_print( + ksea_low_fdr_print( rslt = rslt, i_cntrst = i_cntrst, i = next_index, @@ -3113,21 +5514,24 @@ ) } }, - error = function(e) str(e) + error = function(e) { + str(e) + cat_margins() + } ) } plotted_kinases <- NULL -if (length(rslt$score_list) > 1) { +if (g_can_run_ksea && length(rslt$score_list) > 1) { for (i in seq_len(length(ksea_heatmap_titles))) { hdr <- ksea_heatmap_titles[[i]] which_kinases <- i cat("\\clearpage\n\\begin{center}\n") if (i == const_ksea_astrsk_kinases) { - subsection_header(hdr) + cat(subsection_header(hdr)) } else { - subsection_header(hdr) + cat(subsection_header(hdr)) } cat("\\end{center}\n") @@ -3146,7 +5550,7 @@ m_cutoff = 1, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap - p_cutoff = 0.05, + p_cutoff = params$kseaCutoffThreshold, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster = TRUE, @@ -3163,131 +5567,446 @@ # - 1 : all kinases # - 2 : significant kinases # - 3 : non-significant kinases - which_kinases = which_kinases + which_kinases = which_kinases, + margins = c(7, 15) ) - cat("\\begin{center}\n") - cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") - cat("\\end{center}\n") + if (!is.null(plotted_kinases)) { + cat("\\begin{center}\n") + if (which_kinases != const_ksea_nonastrsk_kinases) + cat("Asterisks reflect significance.\n") + cat("\\end{center}\n") + } } # end for (i in ... } # end if (length ... - -for (i_cntrst in seq_len(length(rslt$score_list))) { - next_index <- i_cntrst - cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] - cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] - cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] - contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) - contrast_longlabel <- ( - sprintf( - "Trt %s {%s} -> Trt %s {%s}", - contrast_metadata_df[i_cntrst, "b_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "b_samples"], - fixed = TRUE - ), - contrast_metadata_df[i_cntrst, "a_level"], - gsub( - pattern = ";", - replacement = ", ", - x = contrast_metadata_df[i_cntrst, "a_samples"], - fixed = TRUE +``` + +```{r kseabar_calc, echo = FALSE, fig.dim = c(9.5, 6), results = 'asis'} +ksea_prints <- list() +ksea_barplots <- list() + for (i_cntrst in seq_len(length(rslt$score_list))) { + next_index <- i_cntrst + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Class %s -> Class %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- rslt$score_list[[next_index]] + if (print_nb_messages) nbe(see_variable(ksea_scores_rslt)) #ACE + + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { + sink(deferred <- file()) + ksea_low_fdr_print( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel, + write_db = FALSE, + anchor = const_table_anchor_t + ) + cat("\n") + sink() + lines <- + paste( + readLines(deferred, warn = FALSE), + collapse = "\n" + ) + close(deferred) + sq_put(ksea_prints, lines) + sink(stderr()) + cat("\n---\n") + cat_variable(ksea_prints) + barplot_closure <- + ksea_low_fdr_barplot_factory( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + if (rlang::is_closure(barplot_closure)) + sq_put(ksea_barplots, barplot_closure) + else + sq_put(ksea_barplots, no_op) + str(ksea_barplots) + cat("\n...\n") + sink() + } + }, + error = function(e) { + str(e) + cat_margins() + } + ) + } +``` + +```{r phosphoelm_kinase_upid_desc, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} + +have_kinase_descriptions <- + if (!is.null(bzip2df(kinase_uprt_desc_lut, kinase_uprt_desc_lut_bz2)) && + !is.null(bzip2df(kinase_name_uprt_lut, kinase_name_uprt_lut_bz2)) + ) { + rownames(kinase_uprt_desc_lut) <- kinase_uprt_desc_lut$UniProtID + kinase_name_to_desc_uprt <- function(s) { + rslt <- NULL + tryCatch( + { + which_rows <- eval(s == kinase_name_uprt_lut$kinase) + kinase_uprtid <- + kinase_name_uprt_lut[which_rows, 2] + # filter for first _HUMAN match if any + grepl_human <- grepl("_HUMAN$", kinase_uprtid) + if (0 < sum(grepl_human)) + kinase_uprtid <- kinase_uprtid[grepl_human] + # filter for first match if any + if (0 < length(kinase_uprtid)) { + kinase_uprtid <- kinase_uprtid[1] + kinase_desc <- kinase_uprt_desc_lut[kinase_uprtid, 2] + if (!is.na(kinase_desc)) + rslt <- c(kinase_desc, kinase_uprtid) + else + rslt <- c(kinase_desc, "") + } + }, + warning = str + ) + rslt + } + TRUE + } else { + kinase_name_to_desc_uprt <- function(s) NULL + FALSE + } +``` + +```{r write_params, echo = FALSE, results = 'asis'} +# perhaps this should be moved into the functions section, eventually ... +write_params <- function(db) { + # write parameters to report + + # write parameters to SQLite output + + mqppep_anova_script_param_df <- data.frame( + script = "mqppep_anova_script.Rmd", + parameter = names(param_unlist), + value = param_unlist ) + ddl_exec(db, " + DROP TABLE IF EXISTS script_parameter; + " ) - main_title <- ( - sprintf( - "Change from treatment %s to treatment %s", - contrast_metadata_df[i_cntrst, "b_level"], - contrast_metadata_df[i_cntrst, "a_level"] - ) + ddl_exec(db, " + CREATE TABLE IF NOT EXISTS script_parameter( + script TEXT, + parameter TEXT, + value ANY, + UNIQUE (script, parameter) ON CONFLICT REPLACE + ) + ; + " + ) + RSQLite::dbWriteTable( + conn = db, + name = "script_parameter", + value = mqppep_anova_script_param_df, + append = TRUE ) - sub_title <- contrast_longlabel - tryCatch( - expr = { - ksea_scores_rslt <- rslt$score_list[[next_index]] - - if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { - low_fdr_barplot( - rslt = rslt, - i_cntrst = i_cntrst, - i = next_index, - a_level = cntrst_a_level, - b_level = cntrst_b_level, - fold_change = cntrst_fold_change, - caption = contrast_longlabel - ) - } - }, - error = function(e) str(e) - ) + + loaded_packages_df <- sessioninfo::package_info("loaded") + loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) + loaded_packages_df <- data.frame( + package = loaded_packages_df$package, + version = loaded_packages_df$loadedversion, + date = loaded_packages_df$date + ) + #ACE cat("\\clearpage\n\\section{R package versions}\n") + #ACE data_frame_tabbing_latex( + #ACE x = loaded_packages_df, + #ACE tabstops = c(2.5, 1.25), + #ACE caption = "R package versions" + #ACE ) + cat("\\clearpage\n\\section{Input parameter settings}\n") + data_frame_tabbing_latex( + x = param_df, + tabstops = c(1.75), + underscore_whack = TRUE, + caption = "Input parameters", + verbatim = FALSE + ) +} + +if (!have_kinase_descriptions) { + write_params(db) + # We are done with output + RSQLite::dbDisconnect(db) + param_df_exit() + knitr::knit_exit() } ``` -```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} - -# Use enriched kinases to find enriched kinase-substrate pairs -enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) -all_enriched_substrates <- sqldf(" - SELECT - gene AS kinase, - ppep, - '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label - FROM ( - SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) +```{r kseabar, echo = FALSE, fig.dim = c(9.5, 12.3), results = 'asis'} +if (have_kinase_descriptions) { + my_section_header <- + sprintf( + "inases whose KSEA %s < %s\n", + ksea_cutoff_statistic, + signif(ksea_cutoff_threshold, 2) + ) + + # Use enriched kinases to find enriched kinase-substrate pairs + enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) + + enriched_kinase_descs <- + Reduce( + f = function(l, r) { + lkup <- kinase_name_to_desc_uprt(r) + if (is.null(lkup)) l + else r2 <- rbind( + l, + data.frame( + kinase = r, + uniprot_id = lkup[2], + description = lkup[1] + ) + ) + }, + x = enriched_kinases$kinase, + init = NULL + ) + + if (length(enriched_kinase_descs) > 0 && nrow(enriched_kinase_descs) > 0) { + cat("\n\\clearpage\n") + data_frame_tabbing_latex( + x = enriched_kinase_descs, + tabstops = c(0.9, 1.3), + headings = c("Kinase", "UniProt ID", "Description"), + caption = paste0("Descriptions of k", my_section_header) + ) + } + + if (FALSE) { + cat_variable(sqldf("SELECT kinase FROM enriched_kinases")) + cat_variable(sqldf(" + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ")) + data_frame_table_latex( + x = sqldf(" + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + "), + justification = "l l l", + centered = TRUE, + caption = "substrates of enriched kinases", + anchor = c(const_table_anchor_p, const_table_anchor_t), + underscore_whack = TRUE ) - GROUP BY ppep - ") - -# helper used to label per-kinase substrate enrichment figure -cat_enriched_heading <- function(m, cut_args) { - cutoff <- cut_args$cutoff - kinase <- cut_args$kinase - statistic <- cut_args$statistic - threshold <- cut_args$threshold - cat("\\newpage\n") - if (nrow(m) > intensity_hm_rows) { - subsection_header( - paste( + data_frame_table_latex( + x = sqldf(" + SELECT + gene AS kinase, + ppep, + sub_gene, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, + fdr_adjusted_anova_p, + quality, + min_group_obs_count + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ), + p_value_data + WHERE ppep = phosphopeptide + GROUP BY kinase, ppep + ORDER BY kinase, ppep, p_value_data.quality DESC + "), + justification = "l l l l l l l", + centered = TRUE, + caption = "labeled substrates of enriched kinases", + anchor = c(const_table_anchor_p, const_table_anchor_t), + underscore_whack = TRUE + ) + } + all_enriched_substrates <- sqldf(" + SELECT + gene AS kinase, + ppep, + sub_gene, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label, + fdr_adjusted_anova_p, + quality, + min_group_obs_count + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE gene IN (SELECT kinase FROM enriched_kinases) + ), + p_value_data + WHERE ppep = phosphopeptide + GROUP BY kinase, ppep + ORDER BY kinase, ppep, p_value_data.quality DESC + ") + + all_enriched_substrates <- + all_enriched_substrates[ + all_enriched_substrates$quality >= params$minQuality, + , + drop = FALSE + ] + + all_enriched_substrates$sub_gene <- + sub( + " ///.*", + " ...", + all_enriched_substrates$sub_gene + ) + + all_enriched_substrates$label <- + with( + all_enriched_substrates, + sprintf( + "(%s-%s) %s", + kinase, + trunc_subgene(sub_gene), + ppep + ) + ) + + # this global is set to TRUE by cat_enriched_heading immediately below + g_neednewpage <- FALSE + + # helper used to label per-kinase substrate enrichment figure + cat_enriched_heading <- function(m, cut_args) { + cutoff <- cut_args$cutoff + kinase <- cut_args$kinase + if (g_neednewpage) cat("\\newpage\n") + g_neednewpage <- TRUE + if (nrow(m) > g_intensity_hm_rows) { + cat(subsection_header( sprintf( - "Lowest p-valued %d (of %d) enriched %s-substrates,", - intensity_hm_rows, + "Highest-quality %d (of %d) enriched %s-substrates", + g_intensity_hm_rows, nrow(m), kinase - ), - sprintf(" KSEA %s < %0.2f\n", statistic, threshold) - ) - ) - } else { - if (nrow(m) == 1) { - return(FALSE) + ) + )) } else { - subsection_header( - paste( + if (nrow(m) == 0) { + return(FALSE) + } else { + nrow_m <- nrow(m) + cat(subsection_header( sprintf( - "%d enriched %s-substrates,", - nrow(m), - kinase - ), - sprintf( - " KSEA %s < %0.2f\n", - statistic, - threshold + "%d enriched %s-substrate%s", + nrow_m, + kinase, + if (nrow_m > 1) "s" else "" ) + )) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) + } + + # -------------------------------- + # hack begin - show all substrates + enriched_substrates <- all_enriched_substrates + # add "FALSE &&" to prevent listing of substrates + if (show_enriched_substrates && nrow(enriched_substrates) > 0) { + short_row_names <- sub( + "$FAILED_MATCH_GENE_NAME", + "not_found", + enriched_substrates$sub_gene, + fixed = TRUE + ) + + if (print_nb_messages) nbe(see_variable(enriched_substrates)) + substrates_df <- with( + enriched_substrates, + data.frame( + kinase = kinase, + substrate = sub(" ///*", "...", short_row_names), + anova_p_value = signif(fdr_adjusted_anova_p, 2), + min_group_obs_count = signif(min_group_obs_count, 0), + quality = signif(quality, 3), + sequence = trunc_n(30)(ppep) ) ) - } + + substrates_df <- substrates_df[ + with(substrates_df, order(kinase, -quality)), + , + drop = FALSE + ] + + if (print_nb_messages) nbe(see_variable(substrates_df)) + if (nrow(substrates_df) < 1) + substrates_df$sequence <- c() + if (print_nb_messages) nbe(see_variable(substrates_df)) + names(substrates_df) <- headers_2nd_line <- + c("Kinase", "Substrate", "p-value", "per group)", "quality", "Sequence") + headers_1st_line <- c("", "", "ANOVA", "min(values", "", "") + data_frame_tabbing_latex( + x = substrates_df, + tabstops = c(1.2, 0.8, 0.5, 0.65, 0.5), + headings = c(headers_1st_line, headers_2nd_line), + caption = "Details for all enriched substrates of enriched kinases" + ) + rm( + enriched_substrates, + substrates_df, + short_row_names, + headers_1st_line, + headers_2nd_line + ) } - cat("\n\n\n") - cat("\n\n\n") - return(TRUE) + cat("\\clearpage\n") + # hack end - show all substrates + # -------------------------------- + + # print deferred tables and graphs for kinases from contrasts + for (i_cntrst in seq_len(length(ksea_prints))) { + #latex_samepage({ + cat(ksea_prints[[i_cntrst]]) + cat("\n") + ksea_barplots[[i_cntrst]]() + cat("\n") + cat("\\clearpage\n") + #}) + } + } - -# Disabling heatmaps for substrates pending decision whether to eliminate them altogether -if (FALSE) +``` + +```{r enriched, echo = FALSE, fig.dim = c(12, 13.7), results = 'asis'} +if (g_can_run_ksea) { + g_did_enriched_header <- FALSE for (kinase_name in sort(enriched_kinases$kinase)) { enriched_substrates <- all_enriched_substrates[ @@ -3295,135 +6014,256 @@ , drop = FALSE ] + ten_trunc_ppep <- trunc_enriched_substrate(enriched_substrates$ppep) + enriched_substrates$label <- with( + enriched_substrates, + sprintf( + "(%s) %s", + make.names( + sub("$FAILED_MATCH_GENE_NAME", "not_found", sub_gene, fixed = TRUE), + unique = TRUE + ), + ten_trunc_ppep + ) + ) # Get the intensity values for the heatmap enriched_intensities <- as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) + # Remove rows having too many NA values to be relevant - na_counter <- is.na(enriched_intensities) - na_counts <- apply(na_counter, 1, sum) - enriched_intensities <- - enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] + good_rows <- (rowSums(enriched_intensities, na.rm = TRUE) != 0) + #ACE nbe(see_variable(good_rows), "\n") + enriched_substrates <- enriched_substrates[good_rows, , drop = FALSE] + enriched_intensities <- enriched_intensities[good_rows, , drop = FALSE] + # Rename the rows with the display-name for the heatmap - rownames(enriched_intensities) <- + short_row_names <- sub( + "$FAILED_MATCH_GENE_NAME", + "not_found", + enriched_substrates$sub_gene, + fixed = TRUE + ) + short_row_names <- + make.names(short_row_names, unique = TRUE) + long_row_names <- sapply( X = rownames(enriched_intensities), FUN = function(rn) { enriched_substrates[enriched_substrates$ppep == rn, "label"] } ) + rownames(enriched_intensities) <- long_row_names # Format as matrix for heatmap m <- as.matrix(enriched_intensities) + rownames(m) <- trunc_enriched_substrate(rownames(m)) + + #ACE nb("m with bad rows: ", see_variable(m), "\n") + #ACE good_rows <- (rowSums(m, na.rm = TRUE) != 0) + #ACE nb(see_variable(good_rows), "\n") + #ACE m <- m[good_rows, , drop = FALSE] + #ACE nb("m without(?) bad rows: ", see_variable(m), "\n") + #ACE nb(see_variable(short_row_names), "\n") + #ACE local_short_row_names <- short_row_names[good_rows] + #ACE local_long_row_names <- long_row_names[good_rows] + #ACE local_enriched_intensities <- enriched_intensities[local_long_row_names, ] + # Draw the heading and heatmap - if (nrow(m) > 0) { + nrow_m <- nrow(m) + if (nrow_m > 0) { + if (!g_did_enriched_header) { + cat("\n\\clearpage\n") + cat(section_header(paste0("K", my_section_header))) + g_did_enriched_header <- TRUE + } + is_na_m <- is.na(m) + cellnote_m <- is_na_m + cellnote_m[!is_na_m] <- "" + cellnote_m[is_na_m] <- "NA" cut_args <- new_env() cut_args$cutoff <- cutoff cut_args$kinase <- kinase_name cut_args$statistic <- ksea_cutoff_statistic cut_args$threshold <- ksea_cutoff_threshold number_of_peptides_found <- - draw_intensity_heatmap( + ppep_heatmap( m = m, + cellnote = cellnote_m, cutoff = cut_args, hm_heading_function = cat_enriched_heading, hm_main_title = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", - suppress_row_dendrogram = FALSE + suppress_row_dendrogram = FALSE, + master_cex = 0.35, + sepcolor = "black", + colsep = sample_colsep ) + if (number_of_peptides_found > 1) { + + tryCatch( + { + rownames(m) <- short_row_names + cov_heatmap(m, nrow_m > g_intensity_hm_rows) + }, + error = function(e) { + cat( + sprintf( + "ERROR: %s\n\\newline\n", + mget("e") + ) + ) + cat( + sprintf( + "message: %s\n\\newline\n", + e$message + ) + ) + cat_margins() + } + ) + } + substrates_df <- with( + enriched_substrates, + data.frame( + substrate = sub(" ///*", "...", short_row_names), + sequence = trunc_long_ppep(ppep), + anova_p_value = signif(fdr_adjusted_anova_p, 2), + min_group_obs_count = signif(min_group_obs_count, 0), + quality = signif(quality, 3) + ) + ) + excess_substrates <- nrow(substrates_df) > g_intensity_hm_rows + if (excess_substrates) + substrates_df <- substrates_df[1:g_intensity_hm_rows, ] + names(substrates_df) <- headers_2nd_line <- + c("Substrate", "Sequence", "p-value", "per group)", "quality") + headers_1st_line <- c("", "", "ANOVA", "min(values", "") + if (1 < nrow(enriched_substrates)) + cat("\n\\newpage\n") + cat(subsubsection_header( + sprintf( + "Details for %s%s-substrates", + if (excess_substrates) + sprintf( + "%s \"highest quality\" ", + g_intensity_hm_rows + ) + else "", + kinase_name + ) + )) + substrates_df <- substrates_df[order(-substrates_df$quality), ] + data_frame_tabbing_latex( + x = substrates_df, + tabstops = c(0.8, 3.8, 0.6, 0.8), + headings = c(headers_1st_line, headers_2nd_line) + ) + } else { + if (print_nb_messages) nbe(see_variable(nrow_m > 0), "\n") } + if (print_nb_messages) nb("end kinase ", kinase_name, "\n") } -# Write output tabular files - -# get kinase, ppep, concat(kinase) tuples for enriched kinases - -kinase_ppep_label <- sqldf(" - WITH - t(ppep, label) AS - ( - SELECT DISTINCT - SUB_MOD_RSD AS ppep, - group_concat(gene, '; ') AS label + # Write output tabular files + + # get kinase, ppep, concat(kinase) tuples for enriched kinases + + if (print_nb_messages) nb("kinase_ppep_label <- ...\n") + if (print_nb_messages) nbe("kinase_ppep_label <- ...\n") + kinase_ppep_label <- sqldf(" + WITH + t(ppep, label) AS + ( + SELECT DISTINCT + SUB_MOD_RSD AS ppep, + group_concat(gene, '; ') AS label + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + GROUP BY ppep + ), + k(kinase, ppep_join) AS + ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join FROM pseudo_ksdata WHERE GENE IN (SELECT kinase FROM enriched_kinases) - GROUP BY ppep - ), - k(kinase, ppep_join) AS - ( - SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) - ) - SELECT k.kinase, t.ppep, t.label - FROM t, k - WHERE t.ppep = k.ppep_join - ORDER BY k.kinase, t.ppep - ") - -# extract what we need from full_data -impish <- cbind(rownames(quant_data_imp), quant_data_imp) -colnames(impish)[1] <- "Phosphopeptide" -data_table_imputed_sql <- " - SELECT - f.*, - k.label AS KSEA_enrichments, - q.* - FROM - metadata_plus_p f - LEFT JOIN kinase_ppep_label k - ON f.Phosphopeptide = k.ppep, - impish q - WHERE - f.Phosphopeptide = q.Phosphopeptide - " -data_table_imputed <- sqldf(data_table_imputed_sql) -# Zap the duplicated 'Phosphopeptide' column named 'ppep' -data_table_imputed <- - data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] - -# Output with imputed, un-normalized data - -write.table( - data_table_imputed - , file = imputed_data_filename - , sep = "\t" - , col.names = TRUE - , row.names = FALSE - , quote = FALSE + ) + SELECT k.kinase, t.ppep, t.label + FROM t, k + WHERE t.ppep = k.ppep_join + ORDER BY k.kinase, t.ppep + ") + + + # extract what we need from full_data + impish <- cbind(rownames(quant_data_imp), quant_data_imp) + colnames(impish)[1] <- "Phosphopeptide" + data_table_imputed_sql <- " + SELECT + f.*, + k.label AS KSEA_enrichments, + q.* + FROM + metadata_plus_p f + LEFT JOIN kinase_ppep_label k + ON f.Phosphopeptide = k.ppep, + impish q + WHERE + f.Phosphopeptide = q.Phosphopeptide + " + data_table_imputed <- sqldf(data_table_imputed_sql) + # Zap the duplicated 'Phosphopeptide' column named 'ppep' + data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + + # Output imputed, un-normalized data + if (print_nb_messages) nb("Output imputed, un-normalized data tabular file\n") + if (print_nb_messages) nbe("Output imputed, un-normalized data tabular file\n") + write.table( + data_table_imputed + , file = imputed_data_filename + , sep = "\t" + , col.names = TRUE + , row.names = FALSE + , quote = FALSE + ) + + + #output quantile normalized data + impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) + colnames(impish)[1] <- "Phosphopeptide" + data_table_imputed <- sqldf(data_table_imputed_sql) + # Zap the duplicated 'Phosphopeptide' column named 'ppep' + data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + if (print_nb_messages) nb("Output quantile normalized data tabular file\n") + if (print_nb_messages) nbe("Output quantile normalized data tabular file\n") + write.table( + data_table_imputed, + file = imp_qn_lt_data_filenm, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE ) - -#output quantile normalized data -impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) -colnames(impish)[1] <- "Phosphopeptide" -data_table_imputed <- sqldf(data_table_imputed_sql) -# Zap the duplicated 'Phosphopeptide' column named 'ppep' -data_table_imputed <- - data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] -write.table( - data_table_imputed, - file = imp_qn_lt_data_filenm, - sep = "\t", - col.names = TRUE, - row.names = FALSE, - quote = FALSE -) - -ppep_kinase <- sqldf(" - SELECT DISTINCT k.ppep, k.kinase - FROM ( - SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep - FROM pseudo_ksdata - WHERE GENE IN (SELECT kinase FROM enriched_kinases) - ) k - ORDER BY k.ppep, k.kinase - ") - -RSQLite::dbWriteTable( - conn = db, - name = "ksea_enriched_ks", - value = ppep_kinase, - append = FALSE - ) + ppep_kinase <- sqldf(" + SELECT DISTINCT k.ppep, k.kinase + FROM ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) k + ORDER BY k.ppep, k.kinase + ") + + RSQLite::dbWriteTable( + conn = db, + name = "ksea_enriched_ks", + value = ppep_kinase, + append = FALSE + ) +} + +if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\n") RSQLite::dbWriteTable( conn = db, @@ -3453,6 +6293,8 @@ " ) +if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\n") +if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\n") write.table( dbReadTable(db, "stats_metadata_v"), file = anova_ksea_mtdt_file, @@ -3462,75 +6304,21 @@ quote = FALSE ) +cat("\n\\clearpage\n") ``` +# Data-processing summary flowchart + +![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf) + ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} cat("\\leavevmode\n\n\n") -# write parameters to report - -param_unlist <- unlist(as.list(params)) -param_df <- data.frame( - parameter = paste0("\\verb@", names(param_unlist), "@"), - value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") - ) - -data_frame_latex( - x = param_df, - justification = "p{0.35\\linewidth} p{0.6\\linewidth}", - centered = TRUE, - caption = "Input parameters", - anchor = const_table_anchor_bp, - underscore_whack = FALSE - ) - -# write parameters to SQLite output - -mqppep_anova_script_param_df <- data.frame( - script = "mqppep_anova_script.Rmd", - parameter = names(param_unlist), - value = param_unlist - ) -ddl_exec(db, " - DROP TABLE IF EXISTS script_parameter; - " -) -ddl_exec(db, " - CREATE TABLE IF NOT EXISTS script_parameter( - script TEXT, - parameter TEXT, - value ANY, - UNIQUE (script, parameter) ON CONFLICT REPLACE - ) - ; - " -) -RSQLite::dbWriteTable( - conn = db, - name = "script_parameter", - value = mqppep_anova_script_param_df, - append = TRUE -) - +write_params(db) # We are done with output RSQLite::dbDisconnect(db) + +cat("\\clearpage\n\\section{R package versions}\n") +utils::toLatex(utils::sessionInfo()) ``` -<!-- -There's gotta be a better way... - -loaded_packages_df <- sessioninfo::package_info("loaded") -loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) -loaded_packages_df <- data.frame( - package = loaded_packages_df$package, - version = loaded_packages_df$loadedversion, - date = loaded_packages_df$date - ) -data_frame_latex( - x = loaded_packages_df, - justification = "l | l l", - centered = FALSE, - caption = "Loaded R packages", - anchor = const_table_anchor_bp - ) --->
--- a/mqppep_mrgfltr.py Mon Jul 11 19:22:25 2022 +0000 +++ b/mqppep_mrgfltr.py Fri Oct 28 18:27:21 2022 +0000 @@ -87,7 +87,10 @@ nargs=1, required=True, dest="phosphopeptides", - help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", + help=" ".join([ + "Phosphopeptide data for experimental results, including the", + "intensities and the mapping to kinase domains, in tabular format" + ]), ) # UniProtKB/SwissProt DB input, SQLite parser.add_argument( @@ -106,7 +109,10 @@ required=False, default=[], dest="species", - help="limit PhosphoSitePlus records to indicated species (field may be empty)", + help=" ".join([ + "limit PhosphoSitePlus records to indicated species", + "(field may be empty)" + ]), ) # outputs: @@ -174,7 +180,7 @@ # determine species to limit records from PSP_Regulatory_Sites if options.species is None: exit( - 'Argument "species" is required (and may be empty) but not supplied' + 'Argument "species" is required (& may be empty) but not supplied' ) try: if len(options.species) > 0: @@ -216,20 +222,25 @@ FUNCTION_PHOSPHORESIDUE = ( "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" ) - GENE_NAME = "Gene_Name" # Gene Name from UniProtKB - ON_FUNCTION = ( - "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites - ) - ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites - ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites - ON_PROCESS = ( - "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites - ) - ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites + # Gene Name from UniProtKB + GENE_NAME = "Gene_Name" + # ON_FUNCTION column from PSP_Regulatory_Sites + ON_FUNCTION = ("ON_FUNCTION") + # NOTES column from PSP_Regulatory_Sites + ON_NOTES = "NOTES" + # ON_OTHER_INTERACT column from PSP_Regulatory_Sites + ON_OTHER_INTERACT = "ON_OTHER_INTERACT" + # ON_PROCESS column from PSP_Regulatory_Sites + ON_PROCESS = ("ON_PROCESS") + # ON_PROT_INTERACT column from PSP_Regulatory_Sites + ON_PROT_INTERACT = "ON_PROT_INTERACT" PHOSPHOPEPTIDE = "Phosphopeptide" PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" PHOSPHORESIDUE = "Phosphoresidue" - PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" + PUTATIVE_UPSTREAM_DOMAINS = " ".join([ + "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/", + "Phosphatases/Binding Domains" + ]) SEQUENCE = "Sequence" SEQUENCE10 = "Sequence10" SEQUENCE7 = "Sequence7" @@ -328,8 +339,26 @@ CitationData ) VALUES (?,?) """ - CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' - CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' + CITATION_INSERT_PSP = " ".join([ + "PhosphoSitePlus(R) (PSP) was created by Cell Signaling", + "Technology Inc. It is licensed under a Creative Commons", + "Attribution-NonCommercial-ShareAlike 3.0 Unported License.", + "When using PSP data or analyses in printed publications or", + "in online resources, the following acknowledgements must be", + "included: (a) the words", + '"PhosphoSitePlus(R), www.phosphosite.org" must', + "be included at appropriate places in the text or webpage,", + "and (b) the following citation must be included in the", + 'bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser', + "JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,", + "PTMs and recalibrations. Nucleic Acids Res. 2015", + '43:D512-20. PMID: 25514926."' + ]) + CITATION_INSERT_PSP_REF = " ".join([ + 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and', + 'recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298,', + "https://doi.org/10.1093/nar/gkr1122" + ]) MRGFLTR_METADATA_COLUMNS = [ "ppep_id", @@ -388,7 +417,8 @@ file1.close() file1_encoded.close() - # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed + # Get the list of phosphopeptides with the p's that represent + # the phosphorylation sites removed re_phos = re.compile("p") end_time = time.process_time() # timer @@ -397,7 +427,7 @@ file=sys.stderr, ) - # ----------- Get SwissProt data from SQLite database (start) ----------- + # -------- Get SwissProt data from SQLite database (start) ----------- # build UniProt sequence LUT and list of unique SwissProt sequences # Open SwissProt SQLite database @@ -465,7 +495,7 @@ Description = "" Gene_Name = "" - # ----------- Get SwissProt data from SQLite database (finish) ----------- + # -------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() # timer print( @@ -473,12 +503,13 @@ file=sys.stderr, ) - # ----------- Get SwissProt data from SQLite database (start) ----------- + # -------- Get SwissProt data from SQLite database (start) ----------- # Open SwissProt SQLite database conn = sql.connect(uniprot_sqlite) cur = conn.cursor() - # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide + # Set up dictionary to aggregate results for phosphopeptides + # corresponding to dephosphoeptide DephosphoPep_UniProtSeq_LUT = {} # Set up dictionary to accumulate results @@ -548,7 +579,7 @@ # Get one tuple for each `phospho_pep` # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] for (upid, gn, desc) in r: - # Append pseudo-tuple per UniProt_ID but only when it is not present + # Append pseudo-tuple per UniProt_ID iff not present if ( upid not in DephosphoPep_UniProtSeq_LUT[ @@ -571,7 +602,7 @@ phospho_pep = dephospho_pep = sequence = 0 upid = gn = desc = r = "" - # ----------- Get SwissProt data from SQLite database (finish) ----------- + # -------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() # timer print( @@ -608,33 +639,6 @@ file=sys.stderr, ) # timer - # ######################################################################## - # # trim upstream_data to include only the upstream map columns - # old_cols = upstream_data.columns.tolist() - # i = 0 - # first_intensity = -1 - # last_intensity = -1 - # intensity_re = re.compile("Intensity.*") - # for col_name in old_cols: - # m = intensity_re.match(col_name) - # if m: - # last_intensity = i - # if first_intensity == -1: - # first_intensity = i - # i += 1 - # # print('last intensity = %d' % last_intensity) - # col_PKCalpha = last_intensity + 2 - # - # data_in_cols = [old_cols[0]] + old_cols[ - # first_intensity: last_intensity + 1 - # ] - # - # if upstream_data.empty: - # print("upstream_data is empty") - # exit(0) - # - # data_in = upstream_data.copy(deep=True)[data_in_cols] - ######################################################################## # trim upstream_data to include only the upstream map columns old_cols = upstream_data.columns.tolist() i = 0 @@ -666,7 +670,7 @@ data_in.columns = data_col_names print("data_in") print(data_in) - ######################################################################## + ###################################################################### # Convert floating-point integers to int64 integers # ref: https://stackoverflow.com/a/68497603/15509512 @@ -689,7 +693,7 @@ ) # timer # Produce a dictionary of metadata for a single phosphopeptide. - # This is a replacement of `UniProtInfo_subdict` in the original code. + # This is replaces `UniProtInfo_subdict` from the original code. def pseq_to_subdict(phospho_pep): # Strip "p" from phosphopeptide sequence dephospho_pep = re_phos.sub("", phospho_pep) @@ -733,19 +737,28 @@ if phospho_pep not in PhosphoPep_UniProtSeq_LUT: raise PreconditionError( phospho_pep, - "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", + " ".join([ + "no matching phosphopeptide found in", + "PhosphoPep_UniProtSeq_LUT" + ]) ) if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, - "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", + " ".join([ + "dephosphorylated phosphopeptide not found", + "in DephosphoPep_UniProtSeq_LUT" + ]) ) if ( - dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] + dephospho_pep != PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP)] ): my_err_msg = "dephosphorylated phosphopeptide does not match " - my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " - my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] + my_err_msg += "PhosphoPep_UniProtSeq_LUT" + my_err_msg += "[(phospho_pep,DEPHOSPHOPEP)] = " + my_err_msg += PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP)] raise PreconditionError(dephospho_pep, my_err_msg) result[SEQUENCE] = [dephospho_pep] @@ -761,20 +774,18 @@ if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, - "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", + "".join( + "no matching phosphopeptide found", + "in DephosphoPep_UniProtSeq_LUT") ) UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, SEQUENCE) ] if len(UniProtSeqList) < 1: print( - "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" - % dephospho_pep + "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] %s" + % (dephospho_pep, "because value has zero length") ) - # raise PreconditionError( - # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", - # 'value has zero length' - # ) for UniProtSeq in UniProtSeqList: i = 0 phosphoresidues = [] @@ -854,13 +865,15 @@ # add Sequence10 if psite < 10: # phospho_pep at N terminus seq10 = ( - str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] + str(UniProtSeq)[:psite] + "p" + str( + UniProtSeq)[psite: psite + 11] ) elif ( len(UniProtSeq) - psite < 11 ): # phospho_pep at C terminus seq10 = ( - str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] + str(UniProtSeq)[psite - 10: psite] + "p" + str( + UniProtSeq)[psite:] ) else: seq10 = str(UniProtSeq)[psite - 10: psite + 11] @@ -922,23 +935,6 @@ newstring = "; ".join( [", ".join(prez) for prez in result[PHOSPHORESIDUE]] ) - # #separate the isoforms in PHOSPHORESIDUE column with ";" - # oldstring = result[PHOSPHORESIDUE] - # oldlist = list(oldstring) - # newstring = "" - # i = 0 - # for e in oldlist: - # if e == ";": - # if numps > 1: - # if i%numps: - # newstring = newstring + ";" - # else: - # newstring = newstring + "," - # else: - # newstring = newstring + ";" - # i +=1 - # else: - # newstring = newstring + e result[PHOSPHORESIDUE] = newstring # separate sequence7's by | @@ -973,7 +969,8 @@ ) # timer # Construct dictionary from list of lists - # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ + # ref: https://www.8bitavenue.com/\ + # how-to-convert-list-of-lists-to-dictionary-in-python/ UniProt_Info = { result[0]: result[1] for result in result_list @@ -982,8 +979,8 @@ end_time = time.process_time() # timer print( - "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" - % (end_time - start_time,), + "%0.6f create dictionary mapping phosphopeptide %s" + % (end_time - start_time, "to metadata dictionary [C]"), file=sys.stderr, ) # timer @@ -1042,14 +1039,13 @@ ).reset_index() seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] - # --- -------------- begin read PSP_Regulatory_sites --------------------------------- - # read in PhosphoSitePlus Regulatory Sites dataset - # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- + # read in PhosphoSitePlus Regulatory Sites dataset from SQLite + # --- -------------- begin read PSP_Regulatory_sites ----- conn = sql.connect(uniprot_sqlite) regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) # Close SwissProt SQLite database conn.close() - # ... -------------- end read PSP_Regulatory_sites ------------------------------------ + # ... -------------- end read PSP_Regulatory_sites ------- # keep only the human entries in dataframe if len(species) > 0: @@ -1127,7 +1123,8 @@ fp_series[i] = fp_series[i].replace("; ; ; ", "; ") fp_series[i] = fp_series[i].replace("; ; ", "; ") - # turn blanks into N_A to signify the info was searched for but cannot be found + # turn blanks into N_A to signify the info + # that was searched for but cannot be found if fp_series[i] == "": fp_series[i] = N_A @@ -1203,7 +1200,6 @@ inplace=True, ) - # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') res2 = sorted( data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() ) @@ -1229,21 +1225,6 @@ file=sys.stderr, ) # timer - # #rename upstream columns in new list - # new_cols = [] - # for name in cols: - # if "_NetworKIN" in name: - # name = name.split("_")[0] - # if " motif" in name: - # name = name.split(" motif")[0] - # if " sequence " in name: - # name = name.split(" sequence")[0] - # if "_Phosida" in name: - # name = name.split("_")[0] - # if "_PhosphoSite" in name: - # name = name.split("_")[0] - # new_cols.append(name) - # rename upstream columns in new list def col_rename(name): if "_NetworKIN" in name: @@ -1276,8 +1257,8 @@ upstream_data_cast.columns = new_cols_cast upstream_data_cast["p_peptide"] = upstream_data.index - # --- -------------- begin read upstream_data_melt ------------------------------------ - # ----------- Get melted kinase mapping data from SQLite database (start) ----------- + # Get melted kinase mapping data from SQLite database + # --- begin read upstream_data_melt --------------------------------- conn = sql.connect(uniprot_sqlite) upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) # Close SwissProt SQLite database @@ -1303,8 +1284,7 @@ % (end_time - start_time, len(upstream_data_melt.axes[0])), file=sys.stderr, ) - # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- - # ... -------------- end read upstream_data_melt -------------------------------------- + # ... end read upstream_data_melt --------------------------------- end_time = time.process_time() # timer print( @@ -1332,10 +1312,13 @@ if p_peptide in melt_dict: melt_dict[p_peptide].append(characterization) else: - exit( - 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' - % (p_peptide) - ) + los = [ + "Phosphopeptide %s" % p_peptide, + "not found in ppep_mapping_db:", + '"phopsphopeptides" and "ppep_mapping_db" must both', + "originate from the same run of mqppep_kinase_mapping" + ] + exit(" ".join(los)) end_time = time.process_time() # timer print( @@ -1397,29 +1380,12 @@ ] ] - # cols_output_prelim = output_df.columns.tolist() - # - # print("cols_output_prelim") - # print(cols_output_prelim) - # - # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] - # - # print("cols_output with p-peptide") - # print(cols_output) - # - # cols_output = [col for col in cols_output if not col == "p-peptide"] - # - # print("cols_output") - # print(cols_output) - # - # output_df = output_df[cols_output] - # join output_df back to quantitative columns in data_in df quant_cols = data_in.columns.tolist() quant_cols = quant_cols[1:] quant_data = data_in[quant_cols] - # ----------- Write merge/filter metadata to SQLite database (start) ----------- + # ---- Write merge/filter metadata to SQLite database (start) ---- # Open SwissProt SQLite database conn = sql.connect(output_sqlite) cur = conn.cursor() @@ -1467,7 +1433,7 @@ # Close SwissProt SQLite database conn.close() - # ----------- Write merge/filter metadata to SQLite database (finish) ----------- + # ---- Write merge/filter metadata to SQLite database (finish) ---- output_df = output_df.merge( quant_data, @@ -1480,15 +1446,18 @@ output_df = output_df[output_cols] # cosmetic changes to Upstream column + # fill the NaN with "" for those Phosphopeptides that got a + # "WARNING: Failed match for " in the upstream mapping output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ PUTATIVE_UPSTREAM_DOMAINS ].fillna( "" - ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping + ) us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) i = 0 while i < len(us_series): - # turn blanks into N_A to signify the info was searched for but cannot be found + # turn blanks into N_A to signify the info + # that was searched for but cannot be found if us_series[i] == "": us_series[i] = N_A i += 1 @@ -1530,8 +1499,9 @@ # Rev. 7/1/2016 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS - # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ - # read from SwissProt SQLite database + # Rev. 12/2/2021: Converted to Python from ipynb; use fast \ + # Aho-Corasick searching; \ + # read from SwissProt SQLite database # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper #
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/perpage.tex Fri Oct 28 18:27:21 2022 +0000 @@ -0,0 +1,547 @@ +% \iffalse +%% +%% perpage is part of the bigfoot bundle for critical typesetting +%% Copyright 2002--2014 David Kastrup <dak@gnu.org> +%% +%% The license notice and corresponding source code for this file are +%% contained in perpage.dtx. +%% +% This program is free software; you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation; either version 2 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program; if not, write to the Free Software +% Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +% \fi +% \CheckSum{396} +% \GetFileInfo{perpage.sty} +% \date{\filedate} +% \author{David Kastrup\thanks{\texttt{dak@gnu.org}}} +% \title{The \texttt{perpage} package\\Version \fileversion} +% \maketitle +% \section{Description} +% +% The \texttt{perpage} package adds the ability to reset counters per +% page and/or keep their occurences sorted in order of appearance on +% the page. +% +% It works by attaching itself to the code for \cmd{\stepcounter} and +% will then modify the given counter according to information written +% to the |.aux| file, which means that multiple passes may be needed. +% Since it uses the internals of the \cmd{\label} mechanism, the need +% for additional passes will get announced by \LaTeX\ as ``labels may +% have changed''. +% +% \DescribeMacro{\MakePerPage} +% \begin{quote} +% |\MakePerPage[2]{footnote}| +% \end{quote} +% will start footnote numbers with~2 on each page (the optional +% argument defaults to~1). 2~might be a strange number, unless you +% have used something like +% \begin{quote} +% |\renewcommand\thefootnote{\fnsymbol{footnote}}| +% \end{quote} +% and want to start off with a dagger. The starting value must not be +% less than~1 so that the counter logic can detect the reset of a +% counter +% reliably.\footnote{This unfortunately means that you can't just use +% \cmd{\alph} in order to get figures on page~10 numbered as ``10'', +% ``10a'', ``10b''.} +% It could be a good idea to redefine |\@cnterr| if you use a format +% with limited range: at the first pass, footnotes are not reset +% across pages and things like |\fnsymbol| will quickly run out of +% characters to use. +% +% \DescribeMacro{\theperpage} +% If you want to label things also on a per page base, for example +% with +% \begin{quote} +% |\renewcommand{\thefigure}{\thepage-\arabic{figure}}| +% \end{quote} +% you'll have the problem that \cmd{\thepage} is updated +% asynchronously with the real page, since \TeX\ does not know which +% page the figure will end up. If you have used the |perpage| package +% for modifying the figure counter, however, at the point where the +% counter is incremented, the macro \cmd{\theperpage} will be set to +% the correct value corresponding to the actual page location. Note +% that this macro is shared between all counters, so advancing a +% different counter under control of |perpage| will render +% \cmd{\thefigure} incorrect. +% +% \DescribeMacro{\MakeSorted} +% \begin{quote} +% |\MakeSorted{figure}| +% \end{quote} +% will make the |figure| counter get `sorted': this means that counter +% values will be assigned in order of appearance in the output, not in +% order of appearance in the source code. For example, the order of +% interspersed one- and two-column figures might get mixed up by +% \LaTeX\ in the output. Making the counter sorted will fix the order +% to match the order of appearance. A similar problem is when +% ordinary footnotes are present in floating material (this does not +% work in standard \LaTeX, but might do so when using |manyfoot.sty| +% or |bigfoot.sty|): this might jumble their order in the output, and +% making their counter sorted will make things appear fine again. +% +% While this would not fix the order in the table of figures, +% fortunately the respective entries actually get written out in order +% of appearance in the output anyway, so this indeed fixes the +% problem. +% +% Manually setting the counter does not lead to reliable results in +% general; as a special case, however, resetting it to zero is +% recognized (this can also happen automatically when the counter is +% dependent on some other counter). The point where it is reset in +% the source code separates `count groups': everything in the source +% before that point is assigned sorted numbers separately from +% everything appearing behind it, and the sequence numbers start again +% with~1 with the first item appearing in the output (not the source) +% from the new count group. +% +% \DescribeMacro{\MakeSortedPerPage} +% \begin{quote} +% |\MakeSortedPerPage[2]{table}| +% \end{quote} +% will make the table numbers restart at 2 on each page \emph{and} +% will keep them sorted, to boot. Introducing new count groups by +% resetting the counter to~0 manually will not work, as it is not +% clear how to handle count groups scattered between pages. You will +% usually want to use something like +% \begin{quote} +% |\renewcommand{\thefigure}{\theperpage-\arabic{figure}}| +% \end{quote} +% to go along with a page-wise figure +% number.\footnote{Note the use of \cmd{\theperpage} here, see above.} +% Note that it would be quite silly to start the ranges with~2: this +% is just an example for the optional argument in case that you ever +% need it. +% +% \DescribeMacro{\AddAbsoluteCounter} +% \begin{quote} +% |\AddAbsoluteCounter{equation}| +% \end{quote} +% will create a counter |absequation| that will advance together with +% the counter |equation| but will not get reset along with it. This +% is not sorted into output order, but just runs along with the +% sequence in the source file. As a special case, the counter +% |abspage| is created in this manner and \cmd{\theabspage} is defined +% as an arabic number that works in the same contexts as \cmd{\page} +% (namely, gets properly deferred by \cmd{\protected@write}). +% +% \StopEventually{} +% \section{The documentation driver} +% This is the default driver for typesetting the docs. Running it +% through as a separate file will include the code section. Running +% the original |.dtx| file through will omit the code. +% \begin{macrocode} +%<*driver> +%ACE \documentclass{ltxdoc} +%ACE \usepackage{perpage} +%ACE \MakePerPage{footnote} +%ACE \begin{document} +%ACE \OnlyDescription +%ACE %<driver> \AlsoImplementation +%ACE \DocInput{perpage.dtx} +%ACE \end{document} +%ACE %</driver> +%ACE % \end{macrocode} +%ACE % +%ACE % \section{The package interfaces} +%ACE % First identification. +%ACE % \begin{macrocode} +%ACE %<*style> +%ACE \NeedsTeXFormat{LaTeX2e} +%ACE \ProvidesPackage{perpage}[2014/10/25 2.0 Reset/sort counters per page] +%ACE % \end{macrocode} +%ACE % \begin{macro}{\pp@cl@begin} +%ACE % \begin{macro}{\pp@cl@end} +%ACE % These macros are considerable tricky. They are called as +%ACE % artificial `dependent' counters when the counter they are hooked +%ACE % into is advanced. The way in which those counters are called are +%ACE % one of the following: +%ACE % \begin{quote} +%ACE % \begin{verbatim} +%ACE % \def\@stpelt#1{\global\csname c@#1\endcsname \z@} +%ACE % \end{verbatim} +%ACE % \end{quote} +%ACE % which is the default way of resetting a subordinate counter used +%ACE % in \LaTeX, or +%ACE % \begin{quote} +%ACE % \begin{verbatim} +%ACE % \def\@stpelt#1{\global\csname c@#1\endcsname \m@ne\stepcounter{#1}} +%ACE % \end{verbatim} +%ACE % \end{quote} +%ACE % which is a little present from |fixltx2e.sty| as of 2014/05/01, +%ACE % quite complicating this feat. +%ACE % +%ACE % The startup code swallows either |\global \advance| or |\global|. +% \begin{macrocode} +\def\pp@cl@begin{\z@\z@ \begingroup} +% \end{macrocode} +% The command used for ending our fake counters checks for the +% |\m@ne| condition. We don't want to bump our auxiliary counters +% twice, so we remove the following |\stepcounter| command. Things +% will go haywire if there is none, of course. +% \begin{macrocode} +\def\pp@cl@end{\afterassignment\pp@cl@end@ii \count@} +\def\pp@cl@end@ii{% + \relax + \expandafter\endgroup + \ifnum\count@<\z@ + \expandafter\pp@cl@end@iii + \fi} +\def\pp@cl@end@iii\stepcounter#1{} +% \end{macrocode} +% \end{macro} +% \end{macro} +% +% \begin{macro}{\AddAbsoluteCounter} +% adds a counter with prefix |abs| to a given counter. It typesets +% as an arabic number and never gets reset. And it is advanced +% whenever the unprefixed counter gets advanced. +% \begin{macrocode} +\newcommand\AddAbsoluteCounter[1] +{\@ifundefined{c@abs#1}{% + \expandafter\newcount\csname c@abs#1\endcsname + \global\value{abs#1}\@ne + \global\expandafter\let\csname cl@abs#1\endcsname\@empty + \expandafter\xdef\csname theabs#1\endcsname{% + \noexpand\number \csname c@abs#1\endcsname}% + \global\@namedef{c@pabs@#1}{\pp@cl@begin + \stepcounter{abs#1}% + \pp@cl@end}% + \@addtoreset{pabs@#1}{#1}}{}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\c@perpage} +% We now create the absolute counter |perpage|: +% \begin{macrocode} +%ACE \AddAbsoluteCounter{page} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\theabspage} +% This has to be specially defined so that it will expand as late as +% \cmd{\thepage} does. Several commands set the latter temporarily +% to \cmd{\relax} in order to inhibit expansion, and we will more or +% less imitate its behavior when found set in that manner. +% \begin{macrocode} +\def\theabspage{\ifx\thepage\relax + \noexpand\theabspage + \else + \number\c@abspage + \fi} +% \end{macrocode} +% \end{macro} +% Here follow the three commands for defining counters per page: +% \begin{macro}{\MakePerPage} +% This creates a counter reset per page. An optional second +% argument specifies the starting point of the sequence. +% \begin{macrocode} +\newcommand*\MakePerPage[2][\@ne]{% + \pp@makeperpage{#2}\c@pchk@{#1}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\MakeSorted} +% This will create a counter sorted in appearance on the page. No +% optional argument is given: set the counter to a desired starting +% value manually if you need to. Resetting it to zero will start a +% new count group, setting it to other values is probably not reliable. +% \begin{macrocode} +\newcommand*\MakeSorted[1]{% + \setcounter{#1}{\z@}% + \pp@makeperpage{#1}\c@schk@{\@ne}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\MakeSortedPerPage} +% This will create output in sorted order, reset on each page. Use +% an optional argument to specify the starting value per page. This +% must not be~0, unfortunately. +% \begin{macrocode} +\newcommand*\MakeSortedPerPage[2][\@ne]{% + \pp@makeperpage{#2}\c@spchk@{#1}} +% \end{macrocode} +% \end{macro} +% All of those must only occur in the preamble since we can't do the +% initialization of the counter values otherwise. +% \begin{macrocode} +\@onlypreamble\MakePerPage +\@onlypreamble\MakeSorted +\@onlypreamble\MakeSortedPerPage +% \end{macrocode} +% \section{Internals} +% +% It works in the following manner: The basic work is done through +% attaching help code to the counter's reset list. Each counter has +% an associated absolute id that is counted through continuously and +% is never reset, thus providing a unique frame of reference. Sorted +% and perpage counters work by writing out information to the +% |.aux| file. +% +% The information we maintain for each counter while processing the +% source file are: +% \begin{itemize} +% \item The absolute counter id. +% \item The last counter value so that we can check whether the +% sequence has been interrupted. +% \item The current scope id. +% \item Its starting value. +% \end{itemize} +% +% The information written to the file consists of: +% \begin{itemize} +% \item The absolute counter id. +% \item The current scope id. +% \item The scope's starting value. +% \item The absolute counter id of a superior counter. +% \end{itemize} +% +% Sorted counters work by writing out the current absolute id and +% range id into the |.aux| file each time the counter gets incremented. +% Whenever the counter is changed in a manner different from being +% incremented, a new counter scope gets started. Each counter scope +% has its own independently assigned counter numbers and is associated +% with its absolute id starting value. So as each counter is +% incremented, we write out the triple of current absolute id, counter +% scope and initial value for the scope. Scope changes when a value +% assigned from the file differs from the `natural' value. When the +% file is read in, counter movements are tracked. Each counter that +% does not have its `natural' value, is having a counter setting +% recorded. +% +% The stuff works by adding a pseudo-reset counter to the counter's +% dependent counter list. +% +% \begin{macro}{\pp@makeperpage} +% This does the relevant things for modifying a counter. It defines +% its reset value, it defines the correspoding absolute counter. +% The absolute counter serves a double function: it is also used for +% assigning numbers while reading the |.aux| file. For this purpose +% it is assigned the initialized values here and in the enddocument +% hook (which is called before rereading the |.aux| file and +% checking for changed labels), while the counter is reset to zero +% at the start of the document. +% \begin{macrocode} +\def\pp@makeperpage#1#2#3{% + \global\expandafter\mathchardef\csname c@pp@r@#1\endcsname=#3\relax + \global\@namedef{c@pchk@#1}{#2{#1}}% + \newcounter{pp@a@#1}% + \setcounter{pp@a@#1}{#3}% + \addtocounter{pp@a@#1}\m@ne + \@addtoreset{pchk@#1}{#1}% + \AtBeginDocument{\setcounter{pp@a@#1}\z@}% + \edef\next{\noexpand\AtEndDocument + {\noexpand\setcounter{pp@a@#1}{% + \number\value{pp@a@#1}}}}\next} +\@onlypreamble\pp@makeperpage +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@chkvlist} +% Check for an empty vertical list. If we have one, that is worth +% warning about. +% \begin{macrocode} +\def\pp@chkvlist{% + \ifcase + \ifvmode + \ifx\lastnodetype\@undefined + \ifdim-\@m\p@=\prevdepth\ifdim\lastskip=\z@\ifnum\lastpenalty=\z@ + \@ne + \fi\fi\fi + \else + \ifnum\lastnodetype=\m@ne \@ne \fi + \fi + \fi \z@ + \or + \PackageWarning{perpage}{\string\stepcounter\space probably at start of + vertical list:^^JYou might need to use \string\leavevmode\space + before it to avoid vertical shifts}% + \fi} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@fetchctr} +% \begin{macro}{\theperpage} +% This fetches the counter information and puts it into +% \cmd{\pp@label}, \cmd{\pp@page} and (globally) into +% \cmd{\theperpage}. +% \begin{macrocode} +\def\pp@fetchctr#1{\expandafter\expandafter\expandafter\pp@fetchctrii + \csname pp@r@#1@\number\value{pp@a@#1}\endcsname + {}{}} + +\global\let\theperpage\@empty + +\def\pp@fetchctrii#1#2#3{\def\pp@label{#1}% + \def\pp@page{#2}% + \gdef\theperpage{#3}} +% \end{macrocode} +% \end{macro} +% \end{macro} +% Ok, let's put together all the stuff for the simplest case, counters +% numbered per page without sorting: +% \begin{macro}{\c@pchk@} +% This is the code buried into to the reset list. When the reset +% list is executed in the context of advancing a counter, we call +% something like +% \begin{verbatim} +%\global\c@pchk@{countername}\z@ +% \end{verbatim} +% since the reset list expected a counter here instead of some +% generic command. That is the reason we start off this command by +% giving \cmd{\global} something to chew on. +% \begin{macrocode} +\def\c@pchk@#1{\pp@cl@begin +% \end{macrocode} +% Now we fetch the page value corresponding to the not yet adjusted +% value of the absolute counter to see whether the previous counter +% advance happened on the same page. +% \begin{macrocode} + \pp@fetchctr{#1}\let\next\pp@page + \addtocounter{pp@a@#1}\@ne + \pp@fetchctr{#1}% +% \end{macrocode} +% We compare the pages for current and last advance of the counter. +% If they differ, we reset the counter to its starting value. We do +% the same if the counter has been reset to zero manually, likely by +% being in the reset list of some other counter. +% \begin{macrocode} + \ifcase\ifx\next\pp@page\else\@ne\fi + \ifnum\value{#1}=\z@\@ne\fi\z@ + \else + \setcounter{#1}{\value{pp@r@#1}}% + \fi + \pp@writectr\pp@pagectr{#1}{\noexpand\theabspage}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@writectr} +% This is the common ending of all pseudo reset counters. It writes +% out an appropriate command to the |.aux| file with all required +% information. We try to replicate any sentinel kerns or penalties. +% \begin{macrocode} +\def\pp@writectr#1#2#3{\edef\next{% + \string#1{#2}{\number\value{pp@a@#2}}{#3}{\noexpand\thepage}}% + \pp@chkvlist + \dimen@=\lastkern + \ifdim\dimen@=\z@ \else \unkern\fi + \count@=\lastpenalty + \protected@write\@auxout{}{\next}% + \ifdim\dimen@=\z@ + \penalty \ifnum\count@<\@M \@M \else \count@ \fi + \else \kern\dimen@\fi + \pp@cl@end} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@labeldef} +% This is a helper macro. +% \begin{macrocode} +\def\pp@labeldef#1#2#3#4#5{\@newl@bel{pp@r@#2}{#3}{{#1}{#4}{#5}}} +% \end{macrocode} +% \end{macro} +% +% \begin{macro}{\pp@pagectr} +% This is the workhorse for normal per page counters. It is called +% whenever the |.aux| file is read in and establishes the +% appropriate information for each counter advancement in a +% pseudolabel. +% \begin{macrocode} +\def\pp@pagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{% + \addtocounter{pp@a@#1}\@ne + \expandafter\pp@labeldef\expandafter + {\number\value{pp@a@#1}}{#1}{#2}{#3}{#4}}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\c@schk@} +% This is called for implementing sorted counters. Sorted counters +% maintain a ``count group'', and the values in each count group are +% numbered independently from that of other count groups. Whenever +% a counter is found to have been reset, it will start a new count +% group. At the end of document, the count group counters need to +% get reset, too, so that the check for changed |.aux| files will +% still work. +% \begin{macrocode} +\def\c@schk@#1{\pp@cl@begin + \addtocounter{pp@a@#1}\@ne + \ifnum\value{#1}=\@ne + \expandafter\xdef\csname pp@g@#1\endcsname{\number\value{pp@a@#1}}% + \edef\next{\noexpand\AtEndDocument{\global\let + \expandafter\noexpand\csname pp@g@#1@\number\value{pp@a@#1}\endcsname + \relax}}\next + \fi + \pp@fetchctr{#1}% + \ifx\pp@page\@empty + \else \setcounter{#1}{\pp@label}\fi + \pp@writectr\pp@spagectr{#1}{\csname pp@g@#1\endcsname}}% +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@spagectr} +% This is the code advancing the respective value of the appropriate +% count group and assigning the label. +% \begin{macrocode} +\def\pp@spagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{% + \count@0\csname pp@g@#1@#3\endcsname + \advance\count@\@ne + \expandafter\xdef\csname pp@g@#1@#3\endcsname{\number\count@}% + \expandafter\pp@labeldef\expandafter + {\number\count@}{#1}{#2}{#3}{#4}}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\c@spchk@} +% And this finally is the counter advance code for sorted counters +% per page. Basically, we just use one count group per page. +% Resetting a counter manually will not introduce a new count group, +% and it would be hard to decide what to do in case count groups and +% page positions overlap. +% \begin{macrocode} +\def\c@spchk@#1{\pp@cl@begin + \addtocounter{pp@a@#1}\@ne + \pp@fetchctr{#1}% + \ifx\pp@page\@empty + \else \setcounter{#1}{\pp@label}\fi + \pp@writectr\pp@ppagectr{#1}{\noexpand\theabspage}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\pp@ppagectr} +% \begin{macrocode} +\def\pp@ppagectr#1#2#3#4{\@ifundefined{c@pp@a@#1}{}{% + \def\next{#3}% + \expandafter\ifx\csname pp@page@#1\endcsname\next + \addtocounter{pp@a@#1}\@ne + \else + \setcounter{pp@a@#1}{\value{pp@r@#1}}% + \fi + \global\expandafter\let\csname pp@page@#1\endcsname\next + \expandafter\pp@labeldef\expandafter + {\number\value{pp@a@#1}}{#1}{#2}{#3}{#4}}} +% \end{macrocode} +% \end{macro} +% \begin{macro}{\@testdef} +% \LaTeX's current (2007) definition of this macro causes save stack +% overflow. We fix this by an additional grouping. Delay to the +% beginning of document to keep Babel happy. +% \begin{macrocode} +\AtBeginDocument{% + \begingroup + \@testdef{}{undefined}{}% + \expandafter + \endgroup + \ifx\@undefined\relax + \let\pp@@testdef\@testdef + \def\@testdef#1#2#3{{\pp@@testdef{#1}{#2}{#3}% + \if@tempswa\aftergroup\@tempswatrue\fi}}% + \fi} +%</style> +% \end{macrocode} +% \end{macro} +% +% \Finale +% \endinput +% Local Variables: +% mode: doctex +% TeX-master: "perpage.drv" +% End:
--- a/search_ppep.py Mon Jul 11 19:22:25 2022 +0000 +++ b/search_ppep.py Fri Oct 28 18:27:21 2022 +0000 @@ -237,7 +237,10 @@ # Parse Command Line parser = argparse.ArgumentParser( - description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)." + description=" ".join([ + "Phopsphoproteomic Enrichment", + "phosphopeptide SwissProt search (in place in SQLite DB)." + ]) ) # inputs: @@ -249,7 +252,11 @@ nargs=1, required=True, dest="phosphopeptides", - help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool", + help=" ".join([ + "Phosphopeptide data for experimental results,", + "generated by the Phopsphoproteomic Enrichment Localization", + "Filter tool" + ]), ) parser.add_argument( "--uniprotkb", @@ -257,7 +264,10 @@ nargs=1, required=True, dest="uniprotkb", - help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool", + help=" ".join([ + "UniProtKB/Swiss-Prot data, converted from FASTA format by the", + "Phopsphoproteomic Enrichment Kinase Mapping tool" + ]), ) parser.add_argument( "--schema", @@ -310,7 +320,8 @@ cur.executescript(DROP_TABLES_SQL) # if options.db_schema: - # print("\nAfter dropping tables/views that are to be created, schema is:") + # print("\nAfter dropping tables/views that are to be created," + # + schema is:") # cur.execute("SELECT * FROM sqlite_schema") # for row in cur.fetchall(): # if row[4] is not None: @@ -403,7 +414,11 @@ deppep_count = row[0] cur.execute( - "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)" + """ + SELECT count(*) FROM ( + SELECT Sequence FROM UniProtKB GROUP BY Sequence + ) + """ ) for row in cur.fetchall(): sequence_count = row[0] @@ -431,9 +446,11 @@ old_seq = "" for row in cur.fetchall(): if duplicate_count == 0: - print( - "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)." - ) + print(" ".join([ + "\nEach of the following sequences is associated with several", + "accession IDs (which are listed in the first column) but", + "the same gene ID (which is listed in the second column)." + ])) if row[2] != old_seq: old_seq = row[2] duplicate_count += 1 @@ -480,13 +497,19 @@ ) else: raise ValueError( - "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" - % (UniProtKB_id,) + "UniProtKB_id %s, but Sequence is None: %s %s" + % ( + UniProtKB_id, + "Check whether SwissProt file is missing", + "the sequence for this ID") ) ker.execute( """ - SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' - FROM uniprotkb_pep_ppep_view + SELECT + count(*) || + ' accession-peptide-phosphopeptide combinations were found' + FROM + uniprotkb_pep_ppep_view """ ) for row in ker.fetchall(): @@ -494,7 +517,9 @@ ker.execute( """ - SELECT count(*) || ' accession matches were found', count(*) AS accession_count + SELECT + count(*) || ' accession matches were found', + count(*) AS accession_count FROM ( SELECT accession FROM uniprotkb_pep_ppep_view @@ -520,7 +545,9 @@ ker.execute( """ - SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count + SELECT + count(*) || ' phosphopeptide matches were found', + count(*) AS phosphopeptide_count FROM ( SELECT phosphopeptide FROM uniprotkb_pep_ppep_view