Repository 'mqppep_preproc'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/mqppep_preproc

Changeset 1:b76c75521d91 (2022-10-28)
Previous changeset 0:8dfd5d2b5903 (2022-07-11) Next changeset 2:a5e7469dfdfa (2022-12-12)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
modified:
MaxQuantProcessingScript.R
macros.xml
mqppep_anova.R
mqppep_anova_script.Rmd
mqppep_mrgfltr.py
mqppep_preproc.xml
search_ppep.py
added:
KSEA_impl_flowchart.dia
KSEA_impl_flowchart.pdf
kinase_name_uniprot_lut.tabular.bz2
kinase_uniprot_description_lut.tabular.bz2
mqppep_anova_preamble.tex
perpage.tex
diff -r 8dfd5d2b5903 -r b76c75521d91 KSEA_impl_flowchart.dia
Binary file KSEA_impl_flowchart.dia has changed
diff -r 8dfd5d2b5903 -r b76c75521d91 KSEA_impl_flowchart.pdf
Binary file KSEA_impl_flowchart.pdf has changed
diff -r 8dfd5d2b5903 -r b76c75521d91 MaxQuantProcessingScript.R
--- a/MaxQuantProcessingScript.R Mon Jul 11 19:22:54 2022 +0000
+++ b/MaxQuantProcessingScript.R Fri Oct 28 18:26:42 2022 +0000
@@ -220,7 +220,6 @@
     type = "character",
     help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
   )
-  # default = "^Number of Phospho [(]STY[)]$",
   ,
   make_option(
     c("-p", "--phosphoCol"),
@@ -229,7 +228,6 @@
     help = paste0("PERL-compatible regular expression matching",
              " header of column having number of 'Phospho (STY)'")
   )
-  # default = "^Intensity[^_]",
   ,
   make_option(
     c("-s", "--startCol"),
@@ -238,7 +236,6 @@
     help = paste0("PERL-compatible regular expression matching",
              " header of column having first sample intensity")
   )
-  # default = 1,
   ,
   make_option(
     c("-I", "--intervalCol"),
@@ -247,7 +244,6 @@
     help = paste0("Column interval between the Intensities of samples",
              " (eg, 1 if subsequent column; 2 if every other column")
   )
-  # default = 0.75,
   ,
   make_option(
     c("-l", "--localProbCutoff"),
@@ -255,7 +251,6 @@
     type = "double",
     help = "Localization Probability Cutoff"
   )
-  # default = "sum",
   ,
   make_option(
     c("-f", "--collapse_func"),
@@ -264,7 +259,6 @@
     help = paste0("merge identical phosphopeptides",
              " by ('sum' or 'average') the intensities")
   )
-  # default = "filtered_data.txt",
   ,
   make_option(
     c("-r", "--filtered_data"),
@@ -272,7 +266,6 @@
     type = "character",
     help = "filtered_data.txt"
   )
-  # default = "quantData.txt",
   ,
   make_option(
     c("-q", "--quant_data"),
diff -r 8dfd5d2b5903 -r b76c75521d91 kinase_name_uniprot_lut.tabular.bz2
Binary file kinase_name_uniprot_lut.tabular.bz2 has changed
diff -r 8dfd5d2b5903 -r b76c75521d91 kinase_uniprot_description_lut.tabular.bz2
Binary file kinase_uniprot_description_lut.tabular.bz2 has changed
diff -r 8dfd5d2b5903 -r b76c75521d91 macros.xml
--- a/macros.xml Mon Jul 11 19:22:54 2022 +0000
+++ b/macros.xml Fri Oct 28 18:26:42 2022 +0000
@@ -1,89 +1,47 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.1.13</token>
+    <token name="@TOOL_VERSION@">0.1.15</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
-            <requirement type="package" version="1.22.2"  >numpy</requirement>
+            <requirement type="package" version="6.2.1"   >gmp</requirement>
+            <requirement type="package" version="1.23.4"  >numpy</requirement>
             <requirement type="package" version="0.3.3"   >openblas</requirement>
-            <requirement type="package" version="1.4.1"   >pandas</requirement>
-            <requirement type="package" version="1.64"    >perl-dbd-sqlite</requirement>
-            <requirement type="package" version="5.26.2"  >perl</requirement>
-            <requirement type="package" version="1.4.0"   >pyahocorasick</requirement>
-            <requirement type="package" version="3.9.10"  >python</requirement>
-            <requirement type="package" version="1.14.2"  >r-data.table</requirement>
-            <requirement type="package" version="1.1.2"   >r-dbi</requirement>
-            <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+            <requirement type="package" version="1.5.1"   >pandas</requirement>
+            <requirement type="package" version="1.70"    >perl-dbd-sqlite</requirement>
+            <requirement type="package" version="5.32.1"  >perl</requirement>
+            <requirement type="package" version="1.4.4"   >pyahocorasick</requirement>
+            <requirement type="package" version="3.10.6"  >python</requirement>
+            <requirement type="package" version="4.1.3"   >r-base</requirement>
+            <requirement type="package" version="6.0_93"  >r-caret</requirement>
+            <requirement type="package" version="1.14.4"  >r-data.table</requirement>
+            <requirement type="package" version="1.1.3"   >r-dbi</requirement>
+            <requirement type="package" version="3.3.6"   >r-ggplot2</requirement>
             <requirement type="package" version="3.1.3"   >r-gplots</requirement>
-            <requirement type="package" version="0.9.4"   >r-latex2exp</requirement>
-            <requirement type="package" version="1.7.1"   >r-optparse</requirement>
+            <requirement type="package" version="0.9.5"   >r-latex2exp</requirement>
+            <requirement type="package" version="1.7.3"   >r-optparse</requirement>
             <requirement type="package" version="1.4.4"   >r-reshape2</requirement>
-            <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
-            <requirement type="package" version="2.2.8"   >r-rsqlite</requirement>
-            <requirement type="package" version="0.4.0"   >r-sass</requirement>
+            <requirement type="package" version="2.17"    >r-rmarkdown</requirement>
+            <!--
+            <requirement type="package" version="2.2.18"  >r-rsqlite</requirement>
+            <requirement type="package" version="0.4.2"   >r-sass</requirement>
+            -->
+            <requirement type="package" version="1.2.2"   >r-sessioninfo</requirement>
             <requirement type="package" version="0.4_11"  >r-sqldf</requirement>
-            <requirement type="package" version="1.4.0"   >r-stringr</requirement>
-            <requirement type="package" version="0.37"    >r-tinytex</requirement>
+            <requirement type="package" version="1.4.1"   >r-stringr</requirement>
+            <requirement type="package" version="0.42"    >r-tinytex</requirement>
             <requirement type="package" version="0.3.7"   >r-vioplot</requirement>
             <!--
             It would be nice to use conda-forge/texlive-core rather than r-tinytex because the
-            former installs texlive when the package is built, but issue 23 blocked PDF-creation.
+            former installs texlive when the package is built, but issue 19/61 blocked PDF-creation.
             Also, texlive-core also gave pango font errors (output had missing symbols replaced
             with boxes) unless I specified the build as well as the version when building a
             conda environment, e.g.:  texlive-core=20210325=h97429d4_0
             -->
         </requirements>
-        <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are not omitted; also, version numbers should lead to reproducible behavior.  Contrast execution times of this (about 18 seconds):
-            echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \
-              bioconductor-preprocesscore=1.56.0 \
-              numpy=1.22.2 \
-              openblas=0.3.3 \
-              pandas=1.4.1 \
-              perl-dbd-sqlite=1.64 \
-              perl-dbd-sqlite=1.64 \
-              perl=5.26.2 \
-              pyahocorasick=1.4.0 \
-              python=3.9.10 \
-              r-data.table=1.14.2 \
-              r-dbi=1.1.2 \
-              r-ggplot2=3.3.5 \
-              r-gplots=3.1.3 \
-              r-latex2exp=0.9.4 \
-              r-optparse=1.7.1 \
-              r-reshape2=1.4.4 \
-              r-rmarkdown=2.11 \
-              r-rsqlite=2.2.8 \
-              r-sass=0.4.0 \
-              r-sqldf=0.4_11 \
-              r-stringr=1.4.0 \
-              r-tinytex=0.37 \
-              r-vioplot=0.3.7
-          with this (42 or more seconds):
-            echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \
-              bioconductor-preprocesscore= \
-              numpy \
-              openblas=0.3.3 \
-              pandas \
-              perl \
-              perl-dbd-sqlite \
-              perl-dbd-sqlite \
-              pyahocorasick \
-              python \
-              r-data.table \
-              r-dbi \
-              r-ggplot2 \
-              r-gplots \
-              r-latex2exp \
-              r-optparse \
-              r-reshape2 \
-              r-rmarkdown \
-              r-rsqlite \
-              r-sass \
-              r-sqldf \
-              r-stringr \
-              r-tinytex \
-              r-vioplot
-
+        <!-- I specified the versions above because it takes a VERY long time
+             to search for package versions when they are not omitted; also,
+             locking version numbers might lead to more-reproducible behavior.
         -->
     </xml>
 </macros>
diff -r 8dfd5d2b5903 -r b76c75521d91 mqppep_anova.R
--- a/mqppep_anova.R Mon Jul 11 19:22:54 2022 +0000
+++ b/mqppep_anova.R Fri Oct 28 18:26:42 2022 +0000
b'@@ -1,20 +1,15 @@\n #!/usr/bin/env Rscript\n # libraries\n library(optparse)\n-library(data.table)\n library(stringr)\n+library(tinytex)\n \n # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285\n \n # parse options\n option_list <- list(\n-  make_option(\n-    c("-i", "--inputFile"),\n-    action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Phosphopeptide Intensities sparse input file path"\n-  ),\n+\n+  # files\n   make_option(\n     c("-a", "--alphaFile"),\n     action = "store",\n@@ -24,64 +19,11 @@\n              " path to text file having one column and no header")\n   ),\n   make_option(\n-    c("-S", "--preproc_sqlite"),\n-    action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n-  ),\n-  make_option(\n-    c("-K", "--ksea_sqlite"),\n+    c("-M", "--anova_ksea_metadata"),\n     action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Path to \'ksea_sqlite\' output produced by this tool"\n-  ),\n-  make_option(\n-    c("-f", "--firstDataColumn"),\n-    action = "store",\n-    default = "^Intensity[^_]",\n-    type = "character",\n-    help = "First column of intensity values"\n-  ),\n-  make_option(\n-    c("-m", "--imputationMethod"),\n-    action = "store",\n-    default = "random",\n+    default = "anova_ksea_metadata.tsv",\n     type = "character",\n-    help = paste0("Method for missing-value imputation,",\n-             " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\n-  ),\n-  make_option(\n-    c("-p", "--meanPercentile"),\n-    action = "store",\n-    default = 3,\n-    type = "integer",\n-    help = paste0("Mean percentile for randomly generated imputed values;",\n-              ", range [1,99]")\n-  ),\n-  make_option(\n-    c("-d", "--sdPercentile"),\n-    action = "store",\n-    default = 3,\n-    type = "double",\n-    help = paste0("Adjustment value for standard deviation of",\n-              " randomly generated imputed values; real")\n-  ),\n-  make_option(\n-    c("-s", "--regexSampleNames"),\n-    action = "store",\n-    default = "\\\\.(\\\\d+)[A-Z]$",\n-    type = "character",\n-    help = "Regular expression extracting sample-names"\n-  ),\n-  make_option(\n-    c("-g", "--regexSampleGrouping"),\n-    action = "store",\n-    default = "(\\\\d+)",\n-    type = "character",\n-    help = paste0("Regular expression extracting sample-group",\n-             " from an extracted sample-name")\n+    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"\n   ),\n   make_option(\n     c("-o", "--imputedDataFile"),\n@@ -102,11 +44,56 @@\n         )\n   ),\n   make_option(\n+    c("-i", "--inputFile"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Phosphopeptide Intensities sparse input file path"\n+  ),\n+  make_option(\n+    c("-K", "--ksea_sqlite"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Path to \'ksea_sqlite\' output produced by this tool"\n+  ),\n+  make_option(\n+    c("-S", "--preproc_sqlite"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n+  ),\n+  make_option(\n     c("-r", "--reportFile"),\n     action = "store",\n-    default = "QuantDataProcessingScript.html",\n+    default = "mqppep_anova.pdf",\n+    type = "character",\n+    help = "PDF report file path"\n+  ),\n+\n+  # parameters\n+  make_option(\n+   
 c("-f", "--firstDataColumn"),\n+    action = "store",\n+    default = "^Intensity[^_]",\n     type = "character",\n-    help = "HTML report file path"\n+    help = "First column of intensity values"\n+  ),\n+  make_option(\n+    c("-m", "--imputationMethod"),\n+    action = "store",\n+    default = "random",\n+    type = "character",\n+    help = paste0("Method for missing-value imputation,",\n+             " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\n+  ),\n+  make_option(\n+    c("-C", "--intensityMinValuesPerClass"),\n+    action = "store",\n+    default = "0",\n+    type = "integer",\n+    hel'..b'_config_file_string(args$regexSampleGrouping, nc)\n+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\\n"))\n+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)\n cat(paste0("regex_sample_names: ",    regex_sample_names,    "\\n"))\n-cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+if (group_filter != "none") {\n+  cat(paste0("group_filter_patterns file: \'", args$sampleGroupFilterPatterns, "\'\\n"))\n+  group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)\n+} else {\n+  group_filter_patterns <- ".*"\n+}\n+cat(paste0("group_filter_patterns: ", group_filter_patterns, "\\n"))\n+\n+sink()\n+\n \n # from: https://github.com/molgenis/molgenis-pipelines/wiki/\n #   How-to-source-another_file.R-from-within-your-R-script\n@@ -253,45 +391,72 @@\n     return(NULL)\n }\n \n-script_dir <-  location_of_this_script()\n+# validation of input parameters is complete; it is now justifiable to\n+#   install LaTeX tools to render markdown as PDF; this involves a big\n+#   download from GitHub\n+if (!tinytex::is_tinytex()) tinytex::install_tinytex()\n \n rmarkdown_params <- list(\n-    inputFile = input_file\n-  , alphaFile = alpha_file\n-  , preprocDb = preproc_sqlite\n+\n+    # files\n+    alphaFile = alpha_file\n+  , anovaKseaMetadata = anova_ksea_metadata_file\n+  , imputedDataFilename = imputed_data_file\n+  , imputedQNLTDataFile = imp_qn_lt_data_file\n+  , inputFile = input_file\n+  , kseaAppPrepDb = ksea_sqlite_file\n+  , preprocDb = preproc_sqlite_file\n+\n+    # parameters\n   , firstDataColumn = first_data_column\n+  , groupFilter = group_filter\n+  , groupFilterMode = group_filter_mode         # arg sampleGroupFilterMode\n+  , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns\n   , imputationMethod = imputation_method\n+  , intensityMinValuesPerGroup = intensity_min_values_per_class\n+  , kseaCutoffStatistic = ksea_cutoff_statistic\n+  , kseaCutoffThreshold = ksea_cutoff_threshold\n+  , kseaMinSubstrateCount = ksea_min_substrate_count\n+  , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add\n   , meanPercentile = mean_percentile\n-  , sdPercentile = sd_percentile\n+  , minQuality = min_quality                          # add\n+  , regexSampleGrouping = regex_sample_grouping\n   , regexSampleNames = regex_sample_names\n-  , regexSampleGrouping = regex_sample_grouping\n-  , imputedDataFilename = imputed_data_file_name\n-  , imputedQNLTDataFile = imp_qn_lt_data_filenm\n-  , anovaKseaMetadata = anova_ksea_metadata\n-  , kseaAppPrepDb = ksea_sqlite\n-  , kseaCutoffThreshold = ksea_cutoff_threshold\n-  , kseaCutoffStatistic = ksea_cutoff_statistic\n+  , sdPercentile = sd_percentile\n   )\n \n print("rmarkdown_params")\n-str(rmarkdown_params)\n+print(rmarkdown_params)\n+print(\n+  lapply(\n+    X 
= rmarkdown_params,\n+    FUN = function(x) {\n+      paste0(\n+        nchar(as.character(x)),\n+        ": \'",\n+        as.character(x),\n+        "\'"\n+      )\n+    }\n+  )\n+)\n+\n \n # freeze the random number generator so the same results will be produced\n #  from run to run\n set.seed(28571)\n \n-# BUG (or "opportunity")\n-# To render as PDF for the time being requires installing the conda\n-# package `r-texlive` until this issue in `texlive-core` is resolved:\n-#   https://github.com/conda-forge/texlive-core-feedstock/issues/19\n-# This workaround is detailed in the fourth comment of:\n-#   https://github.com/conda-forge/texlive-core-feedstock/issues/61\n+script_dir <-  location_of_this_script()\n \n-library(tinytex)\n-tinytex::install_tinytex()\n rmarkdown::render(\n   input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")\n-, output_format = rmarkdown::pdf_document(toc = TRUE)\n , output_file = report_file_name\n , params = rmarkdown_params\n+, output_format = rmarkdown::pdf_document(\n+    includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")\n+  , dev = "pdf"\n+  , toc = TRUE\n+  , toc_depth = 2\n+  , number_sections = FALSE\n+  )\n )\n'
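The tail of this diff shows the key behavioral change in mqppep_anova.R: TinyTeX is now installed only when no TeX distribution is already present, and the PDF is rendered with the new preamble passed through rmarkdown::includes(). A hedged sketch of that render step, with illustrative file paths and a placeholder params list:

    # Sketch of the render step after this change; paths and params are illustrative.
    library(tinytex)

    # Install LaTeX tooling only if none is available (avoids a large download).
    if (!tinytex::is_tinytex()) tinytex::install_tinytex()

    rmarkdown_params <- list(
      inputFile = "test-data/test_input_for_anova.tabular"  # placeholder; the real list is built from the CLI options
    )

    rmarkdown::render(
      input       = "mqppep_anova_script.Rmd",
      output_file = "mqppep_anova.pdf",
      params      = rmarkdown_params,
      output_format = rmarkdown::pdf_document(
        includes        = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex"),
        dev             = "pdf",
        toc             = TRUE,
        toc_depth       = 2,
        number_sections = FALSE
      )
    )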
diff -r 8dfd5d2b5903 -r b76c75521d91 mqppep_anova_preamble.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_preamble.tex Fri Oct 28 18:26:42 2022 +0000
@@ -0,0 +1,90 @@
+% -----------------------------------------------------------------------------
+% preamble includes BEGIN
+% -----------------------------------------------------------------------------
+\usepackage{longtable, lscape, ifthen}
+
+% -----------------------------------------------------------------------------
+% put \T or \B at the ends of lines to add space for super- or sub-
+%   scripts above or below, respectively
+% ref: ?
+\newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
+\newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+
+% -----------------------------------------------------------------------------
+% horizontal line commands; ideally, these would compute the width rather than
+%   hardcoding it
+% ref: ?
+\def\hlinport{\makebox[6.5in]{\hrulefill} \\} % hline outside tabular, port
+\def\hlinlscp{\makebox[9in]{\hrulefill} \\} % hline outside tabular, lndscp
+%ref: https://stackoverflow.com/a/67335722
+\def\hlinnotab{\\makebox[1.0\linewidth]{\hrulefill}\\[1ex]}
+
+% -----------------------------------------------------------------------------
+% ref: https://latex.org/forum/viewtopic.php?p=23257#p23257
+\newcommand{\nonemptyline}[1]{%
+  %\ifthenelse{\equal{#1}{}}{do when empty}{do when not empty}
+  \ifthenelse{\equal{#1}{}}{}{#1}%
+}
+
+% -----------------------------------------------------------------------------
+% For RMarkdown, I needed to put this into a preamble.tex file and include it
+%   via `output: pdf_document: includes: in_header: preamble.tex` because
+%   Markdown was expanding the \tabfill command before writing the tex file
+% ref: https://tex.stackexchange.com/a/119477 in reply to
+%      https://tex.stackexchange.com/questions/119473/tabbing-and-line-wrapping
+\makeatletter
+\newlength\tdima
+\newcommand\tabfill[1]{\setlength\tdima{\linewidth}%
+  \addtolength\tdima{\@totalleftmargin}%
+  \addtolength\tdima{-\dimen\@curtab}%
+  \parbox[t]{\tdima}{#1\ifhmode\strut\fi}}
+  %\parbox[t]{\tdima}{\nonemptyline{#1}\ifhmode\strut\fi}}
+\makeatother
+%
+% Create a tabbing environment in which to use tabfill
+% param #1 is specified the tabstops (as expected by the tabbing
+% environment) and is provided in braces after invocation, e.g.:
+%   \begin{tabwrap}{\hspace{1.25in}\=}
+% param #2 is the contents of the envirnent
+\newenvironment{tabwrap}[2]{%
+  \begin{tabbing}#1\kill\ignorespaces%
+  #2}%
+  {\end{tabbing}%
+}
+
+% -----------------------------------------------------------------------------
+% Make a caption for a non-floating figure or table, e.g.,
+% ref: https://github.com/rf-latex/capt-of/blob/main/capt-of.dtx
+%      https://texfaq.org/FAQ-figurehere
+%   
+% Usage: \captionof{*type*}[*move*]{*caption*}
+%        *type*    is `figure` or `table` (or some type you've
+%                    defined with the`float` package)
+%        *move*    is the optional moving argument *caption* (the thing
+%                    that goes to the list of tables/figures)
+%        *caption* is the text of the caption
+\makeatletter
+\newcommand\captionof[1]{\def\@captype{#1}\caption}
+\makeatother
+%
+%%ACE \captionof{table}{Hello world from line 210}
+% To circumvent mis-numbering of interleaved float and non-float table
+%   and figure captions, it is necessary to include the `perpage` package and
+%   "make them sorted" (FFI see https://texfaq.org/FAQ-figurehere)
+% I (ACE) don't know how to get this package to include:
+%    \usepackage{bigfoot}
+% so I included the source instead:
+\makeatletter
+\input{perpage.tex}
+\makeatother
+%
+% Ensure that table numbers are sorted
+\MakeSorted{table}
+% Ensure that figure numbers are sorted
+\MakeSorted{figure}
+
+% -----------------------------------------------------------------------------
+
+% -----------------------------------------------------------------------------
+% preamble includes END
+% -----------------------------------------------------------------------------
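mqppep_anova_preamble.tex supplies LaTeX commands (struts, a tabbing/tabfill wrapper, and a \captionof macro for non-floating tables) that the R Markdown report emits through chunks declared with results = 'asis'. Purely as an illustration, assuming such a chunk, a caption for a non-floating table could be produced as follows; the table contents are invented:

    # Emit a small non-floating table plus caption as raw LaTeX from an
    # Rmd chunk declared with results = 'asis'.  Values are placeholders.
    df <- data.frame(kinase = c("AKT1", "MAPK1"), substrates = c(12L, 7L))
    cat("\\begin{center}\n\\begin{tabular}{l r}\n")
    cat("kinase & substrates \\\\ \\hline\n")
    cat(paste0(df$kinase, " & ", df$substrates, " \\\\"), sep = "\n")
    cat("\n\\end{tabular}\n")
    cat("\\captionof{table}{Example non-floating table}\n\\end{center}\n")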
diff -r 8dfd5d2b5903 -r b76c75521d91 mqppep_anova_script.Rmd
--- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:54 2022 +0000
+++ b/mqppep_anova_script.Rmd Fri Oct 28 18:26:42 2022 +0000
b'@@ -7,81 +7,153 @@\n date:\n - "May 28, 2018"\n - "; revised June 23, 2022"\n+lot: true\n output:\n   pdf_document:\n     toc: true\n-    toc_depth: 3\n+    toc_depth: 2\n     keep_tex: true\n-header-includes:\n-  - \\usepackage{longtable}\n-  - \\newcommand\\T{\\rule{0pt}{2.6ex}}       % Top strut\n-  - \\newcommand\\B{\\rule[-1.2ex]{0pt}{0pt}} % Bottom strut\n+    dev: pdf\n+    includes:\n+      in_header: mqppep_anova_preamble.tex\n+latex_macros: false\n+raw_tex: true\n+urlcolor: blue\n params:\n   alphaFile:            "test-data/alpha_levels.tabular"\n   inputFile:            "test-data/test_input_for_anova.tabular"\n   preprocDb:            "test-data/test_input_for_anova.sqlite"\n   kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]\n-  show_toc:             true\n-  firstDataColumn:      "^Intensity[^_]"\n-  imputationMethod:     !r c("group-median", "median", "mean", "random")[1]\n-  meanPercentile:       1\n-  sdPercentile:         1.0\n   regexSampleNames:     "\\\\.\\\\d+[A-Z]$"\n   regexSampleGrouping:  "\\\\d+"\n+  groupFilterPatterns:  ".+"\n+  groupFilter:    !r c("none", "exclude", "include")[1]\n+  imputationMethod:     !r c("group-median", "median", "mean", "random")[4]\n+  kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]\n+  #imputationMethod:     !r c("group-median", "median", "mean", "random")[1]\n+\n+  # how should sample groups be interpreted?\n+  #  - "f": fixed patterns (like `grep -F`)\n+  #  - "p": PERL-compatible (like `grep -P`)\n+  #  - "r": extended grep patterns (like `grep -E`)\n+  # use what case sensitivity?\n+  #  - "i": case insensitive matching (like `grep -i`)\n+  groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]\n+  # what pattern should be used for the first column\n+  #   (extended grep pattern, case sensitive)\n+  firstDataColumn:      "^Intensity[^_]"\n+  # for small random value imputation, what percentile should be center?\n+  meanPercentile:       50\n+  #meanPercentile:       1\n+  # for small random value imputation, what should `s / mean(x)` ratio be?\n+  sdPercentile:         1.0\n+  # output path for imputed data file\n   imputedDataFilename:  "test-data/limbo/imputedDataFilename.txt"\n+  # output path for imputed/quantile-normalized/log-transformed data file\n   imputedQNLTDataFile:  "test-data/limbo/imputedQNLTDataFile.txt"\n+  # output path for contents of `stats_metadata_v` table\n   anovaKseaMetadata:    "test-data/limbo/anovaKseaMetadata.txt"\n+  # how to test one variable with > 2 categories (e.g., aov or kruskal.test)\n   oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]\n+  # how to test one variable with 2 categories (e.g., oneway.test)\n   oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]\n-  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]\n-  kseaCutoffThreshold:  !r c( 0.1, 0.05)[2]\n-  kseaMinKinaseCount:   1\n-  intensityHeatmapRows: 75\n+  # what should be the minimum quality for consideration in both\n+  minQuality:           0\n+  # correct KSEA with FDR (recommended) or raw p-value\n+  kseaCutoffStatistic:  !r c("FDR", "p.value")[1]\n+  # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)\n+  #   "perhaps better" meaning that KSEA is an hypothesis-generator, not -test\n+  #kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5)[1]\n+  # minimum number of substrates required for a kinase to be considered in KSEA\n+  kseaMinSubstrateCount: 1\n+  # Should KSEA be performed aggregating signed log2FC or absolute?\n+  # FALSE use raw log2FC 
for KSEA as for KSEAapp::KSEA.Scores\n+  # TRUE  use abs(log2FC) for KSEA as Justin Drake requested; this is a\n+  #         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.\n+  kseaUseAbsoluteLog2FC: TRUE\n+  #kseaUseAbsoluteLog2FC: FALSE\n+  # minimum number of observed values per sample-group\n+  intensityMinValuesPerGroup: 1\n+  # maximum number of heatmap rows (result are poor when > 50)\n+  intensityHeatmapRows: 50\n+  # what should be the primary criterion to eliminate excessive heatmap rows\n+  intensityHeatmapCriteria: '..b'nb_messages) nbe("Output quantile normalized data tabular file\\n")\n+  write.table(\n+    data_table_imputed,\n+    file = imp_qn_lt_data_filenm,\n+    sep = "\\t",\n+    col.names = TRUE,\n+    row.names = FALSE,\n+    quote = FALSE\n   )\n \n-\n-#output quantile normalized data\n-impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)\n-colnames(impish)[1] <- "Phosphopeptide"\n-data_table_imputed <- sqldf(data_table_imputed_sql)\n-# Zap the duplicated \'Phosphopeptide\' column named \'ppep\'\n-data_table_imputed <-\n-    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]\n-write.table(\n-  data_table_imputed,\n-  file = imp_qn_lt_data_filenm,\n-  sep = "\\t",\n-  col.names = TRUE,\n-  row.names = FALSE,\n-  quote = FALSE\n-)\n-\n-ppep_kinase <- sqldf("\n-  SELECT DISTINCT k.ppep, k.kinase\n-    FROM (\n-      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n-        FROM pseudo_ksdata\n-        WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n-      ) k\n-    ORDER BY k.ppep, k.kinase\n-  ")\n-\n-RSQLite::dbWriteTable(\n-  conn = db,\n-  name = "ksea_enriched_ks",\n-  value = ppep_kinase,\n-  append = FALSE\n-  )\n+  ppep_kinase <- sqldf("\n+    SELECT DISTINCT k.ppep, k.kinase\n+      FROM (\n+        SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n+          FROM pseudo_ksdata\n+          WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n+        ) k\n+      ORDER BY k.ppep, k.kinase\n+    ")\n+\n+  RSQLite::dbWriteTable(\n+    conn = db,\n+    name = "ksea_enriched_ks",\n+    value = ppep_kinase,\n+    append = FALSE\n+    )\n+}\n+\n+if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\\n")\n \n RSQLite::dbWriteTable(\n   conn = db,\n@@ -3453,6 +6293,8 @@\n     "\n   )\n \n+if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\\n")\n+if (print_nb_messages) nbe("Output contents of `stats_metadata_v` table to tabular file\\n")\n write.table(\n   dbReadTable(db, "stats_metadata_v"),\n   file = anova_ksea_mtdt_file,\n@@ -3462,75 +6304,21 @@\n   quote = FALSE\n   )\n \n+cat("\\n\\\\clearpage\\n")\n \n ```\n \n+# Data-processing summary flowchart\n+\n+![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)\n+\n ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = \'asis\'}\n cat("\\\\leavevmode\\n\\n\\n")\n \n-# write parameters to report\n-\n-param_unlist <- unlist(as.list(params))\n-param_df <- data.frame(\n-  parameter = paste0("\\\\verb@", names(param_unlist), "@"),\n-  value = paste0("\\\\verb@", gsub("$", "\\\\$", param_unlist, fixed = TRUE), "@")\n-  )\n-\n-data_frame_latex(\n-  x = param_df,\n-  justification = "p{0.35\\\\linewidth} p{0.6\\\\linewidth}",\n-  centered = TRUE,\n-  caption = "Input parameters",\n-  anchor = const_table_anchor_bp,\n-  underscore_whack = FALSE\n-  )\n-\n-# write parameters to SQLite output\n-\n-mqppep_anova_script_param_df <- data.frame(\n-  script    = "mqppep_anova_script.Rmd",\n-  parameter = 
names(param_unlist),\n-  value     = param_unlist\n-  )\n-ddl_exec(db, "\n-  DROP TABLE IF EXISTS script_parameter;\n-  "\n-)\n-ddl_exec(db, "\n-  CREATE TABLE IF NOT EXISTS script_parameter(\n-    script    TEXT,\n-    parameter TEXT,\n-    value     ANY,\n-    UNIQUE (script, parameter) ON CONFLICT REPLACE\n-    )\n-    ;\n-  "\n-)\n-RSQLite::dbWriteTable(\n-  conn = db,\n-  name = "script_parameter",\n-  value = mqppep_anova_script_param_df,\n-  append = TRUE\n-)\n-\n+write_params(db)\n # We are done with output\n RSQLite::dbDisconnect(db)\n+\n+cat("\\\\clearpage\\n\\\\section{R package versions}\\n")\n+utils::toLatex(utils::sessionInfo())\n ```\n-<!--\n-There\'s gotta be a better way...\n-\n-loaded_packages_df <-  sessioninfo::package_info("loaded")\n-loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)\n-loaded_packages_df <- data.frame(\n-  package = loaded_packages_df$package,\n-  version = loaded_packages_df$loadedversion,\n-  date    = loaded_packages_df$date\n-  )\n-data_frame_latex(\n-  x = loaded_packages_df,\n-  justification = "l | l l",\n-  centered = FALSE,\n-  caption = "Loaded R packages",\n-  anchor = const_table_anchor_bp\n-  )\n--->\n'
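Among the parameters introduced above, regexSampleNames ("\\.\\d+[A-Z]$") and regexSampleGrouping ("\\d+") control how sample names and sample groups are pulled out of the intensity column headers. A minimal sketch of that extraction with stringr (pinned in macros.xml); the column names here are invented for illustration:

    library(stringr)

    # Hypothetical intensity column headers matching firstDataColumn "^Intensity[^_]".
    intensity_cols <- c("Intensity.1A", "Intensity.1B", "Intensity.2A", "Intensity.2B")

    # regexSampleNames "\\.\\d+[A-Z]$"  ->  ".1A" ".1B" ".2A" ".2B"
    sample_names  <- str_extract(intensity_cols, "\\.\\d+[A-Z]$")

    # regexSampleGrouping "\\d+"        ->  "1" "1" "2" "2"
    sample_groups <- str_extract(sample_names, "\\d+")

    split(sample_names, sample_groups)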
diff -r 8dfd5d2b5903 -r b76c75521d91 mqppep_mrgfltr.py
--- a/mqppep_mrgfltr.py Mon Jul 11 19:22:54 2022 +0000
+++ b/mqppep_mrgfltr.py Fri Oct 28 18:26:42 2022 +0000
b'@@ -87,7 +87,10 @@\n         nargs=1,\n         required=True,\n         dest="phosphopeptides",\n-        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",\n+        help=" ".join([\n+            "Phosphopeptide data for experimental results, including the",\n+            "intensities and the mapping to kinase domains, in tabular format"\n+        ]),\n     )\n     #   UniProtKB/SwissProt DB input, SQLite\n     parser.add_argument(\n@@ -106,7 +109,10 @@\n         required=False,\n         default=[],\n         dest="species",\n-        help="limit PhosphoSitePlus records to indicated species (field may be empty)",\n+        help=" ".join([\n+            "limit PhosphoSitePlus records to indicated species",\n+            "(field may be empty)"\n+        ]),\n     )\n \n     # outputs:\n@@ -174,7 +180,7 @@\n     # determine species to limit records from PSP_Regulatory_Sites\n     if options.species is None:\n         exit(\n-            \'Argument "species" is required (and may be empty) but not supplied\'\n+            \'Argument "species" is required (& may be empty) but not supplied\'\n         )\n     try:\n         if len(options.species) > 0:\n@@ -216,20 +222,25 @@\n         FUNCTION_PHOSPHORESIDUE = (\n             "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"\n         )\n-        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB\n-        ON_FUNCTION = (\n-            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites\n-        )\n-        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites\n-        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n-        ON_PROCESS = (\n-            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites\n-        )\n-        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+        # Gene Name from UniProtKB\n+        GENE_NAME = "Gene_Name"\n+        # ON_FUNCTION column from PSP_Regulatory_Sites\n+        ON_FUNCTION = ("ON_FUNCTION")\n+        # NOTES column from PSP_Regulatory_Sites\n+        ON_NOTES = "NOTES"\n+        # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"\n+        # ON_PROCESS column from PSP_Regulatory_Sites\n+        ON_PROCESS = ("ON_PROCESS")\n+        # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+        ON_PROT_INTERACT = "ON_PROT_INTERACT"\n         PHOSPHOPEPTIDE = "Phosphopeptide"\n         PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"\n         PHOSPHORESIDUE = "Phosphoresidue"\n-        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"\n+        PUTATIVE_UPSTREAM_DOMAINS = " ".join([\n+            "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",\n+            "Phosphatases/Binding Domains"\n+        ])\n         SEQUENCE = "Sequence"\n         SEQUENCE10 = "Sequence10"\n         SEQUENCE7 = "Sequence7"\n@@ -328,8 +339,26 @@\n             CitationData\n           ) VALUES (?,?)\n           """\n-        CITATION_INSERT_PSP = \'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. 
When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."\'\n-        CITATION_INSERT_PSP_REF = \'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122\'\n+        CITATION_INSERT_PSP = " '..b'- end read upstream_data_melt --------------------------------------\n+        # ... end read upstream_data_melt ---------------------------------\n \n         end_time = time.process_time()  # timer\n         print(\n@@ -1332,10 +1312,13 @@\n             if p_peptide in melt_dict:\n                 melt_dict[p_peptide].append(characterization)\n             else:\n-                exit(\n-                    \'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping\'\n-                    % (p_peptide)\n-                )\n+                los = [\n+                    "Phosphopeptide %s" % p_peptide,\n+                    "not found in ppep_mapping_db:",\n+                    \'"phopsphopeptides" and "ppep_mapping_db" must both\',\n+                    "originate from the same run of mqppep_kinase_mapping"\n+                ]\n+                exit(" ".join(los))\n \n         end_time = time.process_time()  # timer\n         print(\n@@ -1397,29 +1380,12 @@\n             ]\n         ]\n \n-        # cols_output_prelim = output_df.columns.tolist()\n-        #\n-        # print("cols_output_prelim")\n-        # print(cols_output_prelim)\n-        #\n-        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]\n-        #\n-        # print("cols_output with p-peptide")\n-        # print(cols_output)\n-        #\n-        # cols_output = [col for col in cols_output if not col == "p-peptide"]\n-        #\n-        # print("cols_output")\n-        # print(cols_output)\n-        #\n-        # output_df = output_df[cols_output]\n-\n         # join output_df back to quantitative columns in data_in df\n         quant_cols = data_in.columns.tolist()\n         quant_cols = quant_cols[1:]\n         quant_data = data_in[quant_cols]\n \n-        # ----------- Write merge/filter metadata to SQLite database (start) -----------\n+        # ---- Write merge/filter metadata to SQLite database (start) ----\n         # Open SwissProt SQLite database\n         conn = sql.connect(output_sqlite)\n         cur = conn.cursor()\n@@ -1467,7 +1433,7 @@\n \n         # Close SwissProt SQLite database\n         conn.close()\n-        # ----------- Write merge/filter metadata to SQLite database (finish) -----------\n+        # ---- Write merge/filter metadata to SQLite database (finish) ----\n \n         output_df = output_df.merge(\n             quant_data,\n@@ -1480,15 +1446,18 @@\n         output_df = output_df[output_cols]\n \n         # cosmetic changes to Upstream column\n+        # fill the NaN with "" for those Phosphopeptides that got a\n+        #   "WARNING: Failed match for " in the upstream mapping\n         output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[\n 
            PUTATIVE_UPSTREAM_DOMAINS\n         ].fillna(\n             ""\n-        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping\n+        )\n         us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])\n         i = 0\n         while i < len(us_series):\n-            # turn blanks into N_A to signify the info was searched for but cannot be found\n+            # turn blanks into N_A to signify the info\n+            #   that was searched for but cannot be found\n             if us_series[i] == "":\n                 us_series[i] = N_A\n             i += 1\n@@ -1530,8 +1499,9 @@\n         # Rev. 7/1/2016\n         # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A\'s\n         # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS\n-        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \\\n-        #                read from SwissProt SQLite database\n+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast \\\n+        #                 Aho-Corasick searching; \\\n+        #                 read from SwissProt SQLite database\n         # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper\n \n         #\n'
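mqppep_mrgfltr.py writes its merge/filter metadata into an SQLite database that the downstream ANOVA tool consumes. A hedged sketch, assuming an R session where RSQLite is available (for example via the r-sqldf dependency), of inspecting that database; the file name and the commented table name are illustrative guesses, not names confirmed by this diff:

    library(DBI)
    library(RSQLite)

    db <- DBI::dbConnect(RSQLite::SQLite(), "preproc_sqlite.sqlite")  # illustrative path
    print(DBI::dbListTables(db))               # see which tables the step wrote
    # meta <- DBI::dbReadTable(db, "Citation")   # hypothetical table name
    DBI::dbDisconnect(db)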
diff -r 8dfd5d2b5903 -r b76c75521d91 mqppep_preproc.xml
--- a/mqppep_preproc.xml Mon Jul 11 19:22:54 2022 +0000
+++ b/mqppep_preproc.xml Fri Oct 28 18:26:42 2022 +0000
b'@@ -288,31 +288,36 @@\n         </test>\n     </tests>\n     <help><![CDATA[\n-=========================================================\n-Phopsphoproteomic Enrichment Pipeline Preprocessing Steps\n-=========================================================\n+=============================================================\n+**Phopsphoproteomic Enrichment Pipeline Preprocessing Steps**\n+=============================================================\n \n-**Overview**\n+*Overview*\n+==========\n \n Prior to statistical analysis, it is necessary to perform\n three steps to transform the MaxQuant output\n for phosphoproteome-enriched samples.\n \n-**Workflow position**\n+*Workflow position*\n+===================\n \n-``upstream tool``\n-      The input data file for this tool is the ``Phospho (STY)Sites.txt`` file that is produced:\n+Upstream tool\n+=============\n+\n+The input dataset for this tool is the ``Phospho (STY)Sites.txt`` file that is produced:\n \n-      - by the Galaxy "MaxQuant" (``maxquant``) tool\n-      - or by the Galaxy "Maxquant (using mqpar.xml)" (``maxquant_mqpar``) tool\n-      - or by the desktop version of MaxQuant.\n+   - by the Galaxy "MaxQuant" (``maxquant``) tool\n+   - or by the Galaxy "Maxquant (using mqpar.xml)" (``maxquant_mqpar``) tool\n+   - or by the desktop version of MaxQuant.\n \n-``downstream tool``\n-  The "MaxQuant Phosphopeptide ANOVA" tool (``mqppep_anova``) consumes the ``merged/filtered`` output file ``preproc_tab`` that this tool produces.\n+Downstream tool\n+===============\n \n-======================================================================\n-Phopsphoproteomic Enrichment Pipeline Localization-Probability Cut-Off\n-======================================================================\n+The "MaxQuant Phosphopeptide ANOVA" tool (``mqppep_anova``) consumes the "preprocessed" output file ``preproc_tab`` that this tool produces.\n+\n+*Phopsphoproteomic Enrichment Pipeline Localization-Probability Cut-Off*\n+========================================================================\n \n This step applies a "localization-probability cut-off" for phosphopeptides for each phosphopeptide.\n Higher values may reduce the number of peptides in the output.\n@@ -336,30 +341,48 @@\n so it is omitted here even though it was included in Larry Cheng\'s original script.\n \n \n-**Input dataset**\n+Input dataset\n+=============\n+\n+Phospho (STY)Sites.txt\n+   This is the ``MaxQuant Phospho (STY)Sites.txt`` file produced by MaxQuant.\n+   If you use the desktop version of MaxQuant, you will find this file in the ``txt`` folder.\n \n-``phosphoSites``\n-    This is the ``MaxQuant Phospho (STY)Sites.txt`` file produced by MaxQuant.\n-    If you use the desktop version of MaxQuant, you will find this file in the ``txt`` folder.\n+Input parameters\n+================\n+\n+Localization probability cutoff\n+  Minimum localization probability; see above.\n \n-**Output datasets**\n+Intensity merge-function\n+  Specifies how intensities for identical phosphosites should be merged; see above.\n+\n+Output datasets\n+===============\n \n ``ppep_intensities``\n   Data table (in tabular format) presenting, for each sample, the mass-spectral intensity of each phopshopeptide having localization probability greater than the cutoff.\n+\n ``enrichment.pdf``\n   Graph (in PDF format) presenting non-zero proportions of pS, pT, and pY among the phosphosites; note that a phosphopeptide may have multiple phosphosite.\n+\n ``locProbCutoff.pdf``\n   Graph (in PDF format) contrasting 
proportion of phosphopeptides above the localization probability cutoff with the proportion below.\n+\n ``enrichment.svg``\n   Enrichment graph (in downloadable "scalable vector graphics" format) for incorporation into documents.\n+\n ``locProbCutoff.svg``\n   Localization probability cutoff graph (in downloadable "scalable vector graphics" format) for incorporation into documents.\n+\n ``filteredData``\n   Data table (in tabular format) comprising rows of the ``phosphSites`` input file that are not flagged as contaminants or reversed sequences.'..b'ensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (`https://creativecommons.org/licenses/by-nc-sa/3.0/ <https://creativecommons.org/licenses/by-nc-sa/3.0/>`_). Attribution must be given in written, oral and digital presentations to PhosphoSitePlus, www.phosphosite.org. Written documents should additionally cite:\n \n-          Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40, D261-D270.; www.phosphosite.org.\n+       Hornbeck PV, Kornhauser JM, Tkachev S, Zhang B, Skrzypek E, Murray B, Latham V, Sullivan M (2012) PhosphoSitePlus: a comprehensive resource for investigating the structure and function of experimentally determined post-translational modifications in man and mouse. Nucleic Acids Res. 40, D261-D270.; www.phosphosite.org.\n \n ``psp_regulatory_sites``\n-  \'Regulatory sites: information curated from the literature about modification sites shown to regulate molecular functions, biological processes, and molecular interactions including protein-protein interactions\' [Hornbeck 2011].  This tabular-formatted file may be downloaded for non-commercial purposes as \'Regulatory_sites.gz\' from `https://www.phosphosite.org/staticDownloads.action <https://www.phosphosite.org/staticDownloads.action>`_.\n+   \'Regulatory sites: information curated from the literature about modification sites shown to regulate molecular functions, biological processes, and molecular interactions including protein-protein interactions\' [Hornbeck 2011].  This tabular-formatted file may be downloaded for non-commercial purposes as \'Regulatory_sites.gz\' from `https://www.phosphosite.org/staticDownloads.action <https://www.phosphosite.org/staticDownloads.action>`_.\n \n       Terms of use and citatation are as for the ``psp_kinase_substrate`` file.\n \n-**Output datasets**\n+Output datasets\n+===============\n \n ``ppep_map``\n   Data table (in tabular format, consumed by the merge/filter step) presenting, for each phosphopeptide, the kinase mappings,  the mass-spectral intensities for each sample, and the metadata from UniProtKB/SwissProt, phospho-sites, phospho-motifs, and regulatory sites.  Data in the columns marked "``Domain``", "``ON_...``", or "``..._PhosphoSite``" are available subject to the following terms:\n@@ -455,7 +476,8 @@\n ``ppep_mapping_sqlite``\n   SQLite database (consumed by the merge/filter step).\n \n-**Authors**\n+Authors\n+=======\n \n ``Nicholas A. 
Graham``\n   (`ORCiD 0000-0002-6811-1941 <https://orcid.org/0000-0002-6811-1941>`_) wrote the original script.\n@@ -464,18 +486,19 @@\n   (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.\n \n \n-======================================================\n-Phopsphoproteomic Enrichment Pipeline Merge and Filter\n-======================================================\n+*Phopsphoproteomic Enrichment Pipeline Merge and Filter*\n+========================================================\n \n This step merges mapped metadata into metadata for phosphopeptides, filtering by species.\n \n-**Input parameters**\n+Input parameters\n+================\n \n ``species``\n   Limit PhosphoSitesPlus to indicated species. Default: **human**\n \n-**Output datasets**\n+Output datasets\n+===============\n \n ``preproc_tab``\n   Phosphopeptides annotated with SwissProt and phosphosite metadata, in tabular format.  This file is designed to be consumed by the downstream ANOVA tool.  Some data in the columns marked "PSP" are available subject to the following terms:\n@@ -488,7 +511,8 @@\n ``preproc_sqlite``\n   ``ppep_mapping_sqlite`` updated with annotations, in SQLite format.\n \n-**Authors**\n+Authors\n+=======\n \n ``Nicholas A. Graham``\n   (`ORCiD 0000-0002-6811-1941 <https://orcid.org/0000-0002-6811-1941>`_) initiated the original script.\n'
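The reorganized help text above describes the localization-probability cutoff and the intensity merge-function, which collapses identical phosphopeptides by summing or averaging their sample intensities. A toy R sketch of that collapse, with invented peptides and intensities, assuming the 'sum' option:

    # Collapse rows sharing a phosphopeptide by the chosen merge function.
    collapse_func <- sum          # or mean, for the 'average' option

    toy <- data.frame(
      Phosphopeptide = c("AAApSPEK", "AAApSPEK", "GGGpYVR"),   # invented
      Intensity.1A   = c(1.0e6, 2.0e6, 5.0e5),
      Intensity.2A   = c(3.0e6, 1.0e6, 0.0)
    )

    aggregate(
      toy[, c("Intensity.1A", "Intensity.2A")],
      by  = list(Phosphopeptide = toy$Phosphopeptide),
      FUN = collapse_func
    )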
diff -r 8dfd5d2b5903 -r b76c75521d91 perpage.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/perpage.tex Fri Oct 28 18:26:42 2022 +0000
b"@@ -0,0 +1,547 @@\n+% \\iffalse\n+%%\n+%% perpage is part of the bigfoot bundle for critical typesetting\n+%% Copyright 2002--2014  David Kastrup <dak@gnu.org>\n+%%\n+%% The license notice and corresponding source code for this file are\n+%% contained in perpage.dtx.\n+%%\n+% This program is free software; you can redistribute it and/or modify\n+% it under the terms of the GNU General Public License as published by\n+% the Free Software Foundation; either version 2 of the License, or\n+% (at your option) any later version.\n+%\n+% This program is distributed in the hope that it will be useful,\n+% but WITHOUT ANY WARRANTY; without even the implied warranty of\n+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+% GNU General Public License for more details.\n+%\n+% You should have received a copy of the GNU General Public License\n+% along with this program; if not, write to the Free Software\n+% Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA\n+% \\fi\n+% \\CheckSum{396}\n+% \\GetFileInfo{perpage.sty}\n+% \\date{\\filedate}\n+% \\author{David Kastrup\\thanks{\\texttt{dak@gnu.org}}}\n+% \\title{The \\texttt{perpage} package\\\\Version \\fileversion}\n+% \\maketitle\n+% \\section{Description}\n+%\n+% The \\texttt{perpage} package adds the ability to reset counters per\n+% page and/or keep their occurences sorted in order of appearance on\n+% the page.\n+%\n+% It works by attaching itself to the code for \\cmd{\\stepcounter} and\n+% will then modify the given counter according to information written\n+% to the |.aux| file, which means that multiple passes may be needed.\n+% Since it uses the internals of the \\cmd{\\label} mechanism, the need\n+% for additional passes will get announced by \\LaTeX\\ as ``labels may\n+% have changed''.\n+%\n+% \\DescribeMacro{\\MakePerPage}\n+% \\begin{quote}\n+%   |\\MakePerPage[2]{footnote}|\n+% \\end{quote}\n+% will start footnote numbers with~2 on each page (the optional\n+% argument defaults to~1).  2~might be a strange number, unless you\n+% have used something like\n+% \\begin{quote}\n+%   |\\renewcommand\\thefootnote{\\fnsymbol{footnote}}|\n+% \\end{quote}\n+% and want to start off with a dagger.  The starting value must not be\n+% less than~1 so that the counter logic can detect the reset of a\n+% counter\n+% reliably.\\footnote{This unfortunately means that you can't just use\n+%   \\cmd{\\alph} in order to get figures on page~10 numbered as ``10'',\n+%   ``10a'', ``10b''.}\n+% It could be a good idea to redefine |\\@cnterr| if you use a format\n+% with limited range: at the first pass, footnotes are not reset\n+% across pages and things like |\\fnsymbol| will quickly run out of\n+% characters to use.\n+%\n+% \\DescribeMacro{\\theperpage}\n+% If you want to label things also on a per page base, for example\n+% with\n+% \\begin{quote}\n+%   |\\renewcommand{\\thefigure}{\\thepage-\\arabic{figure}}|\n+% \\end{quote}\n+% you'll have the problem that \\cmd{\\thepage} is updated\n+% asynchronously with the real page, since \\TeX\\ does not know which\n+% page the figure will end up.  If you have used the |perpage| package\n+% for modifying the figure counter, however, at the point where the\n+% counter is incremented, the macro \\cmd{\\theperpage} will be set to\n+% the correct value corresponding to the actual page location.  
Note\n+% that this macro is shared between all counters, so advancing a\n+% different counter under control of |perpage| will render\n+% \\cmd{\\thefigure} incorrect.\n+%\n+% \\DescribeMacro{\\MakeSorted}\n+% \\begin{quote}\n+%   |\\MakeSorted{figure}|\n+% \\end{quote}\n+% will make the |figure| counter get `sorted': this means that counter\n+% values will be assigned in order of appearance in the output, not in\n+% order of appearance in the source code.  For example, the order of\n+% interspersed one- and two-column figures might get mixed up by\n+% \\LaTeX\\ in the output.  Making the counter sorted will fix the order\n+% to match the order of appearance.  A similar problem is when\n+% ordinary footnotes are present in floating material (this does not\n+% work in standard "..b'@\n+    \\penalty \\ifnum\\count@<\\@M \\@M \\else \\count@ \\fi\n+  \\else \\kern\\dimen@\\fi\n+  \\pp@cl@end}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@labeldef}\n+%   This is a helper macro.\n+%    \\begin{macrocode}\n+\\def\\pp@labeldef#1#2#3#4#5{\\@newl@bel{pp@r@#2}{#3}{{#1}{#4}{#5}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\begin{macro}{\\pp@pagectr}\n+%   This is the workhorse for normal per page counters.  It is called\n+%   whenever the |.aux| file is read in and establishes the\n+%   appropriate information for each counter advancement in a\n+%   pseudolabel.\n+%    \\begin{macrocode}\n+\\def\\pp@pagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\addtocounter{pp@a@#1}\\@ne\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@schk@}\n+%   This is called for implementing sorted counters.  Sorted counters\n+%   maintain a ``count group\'\', and the values in each count group are\n+%   numbered independently from that of other count groups.  Whenever\n+%   a counter is found to have been reset, it will start a new count\n+%   group.  At the end of document, the count group counters need to\n+%   get reset, too, so that the check for changed |.aux| files will\n+%   still work.\n+%    \\begin{macrocode}\n+\\def\\c@schk@#1{\\pp@cl@begin\n+  \\addtocounter{pp@a@#1}\\@ne\n+  \\ifnum\\value{#1}=\\@ne\n+    \\expandafter\\xdef\\csname pp@g@#1\\endcsname{\\number\\value{pp@a@#1}}%\n+    \\edef\\next{\\noexpand\\AtEndDocument{\\global\\let\n+      \\expandafter\\noexpand\\csname pp@g@#1@\\number\\value{pp@a@#1}\\endcsname\n+      \\relax}}\\next\n+  \\fi\n+  \\pp@fetchctr{#1}%\n+  \\ifx\\pp@page\\@empty\n+  \\else \\setcounter{#1}{\\pp@label}\\fi\n+  \\pp@writectr\\pp@spagectr{#1}{\\csname pp@g@#1\\endcsname}}%\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@spagectr}\n+%   This is the code advancing the respective value of the appropriate\n+%   count group and assigning the label.\n+%    \\begin{macrocode}\n+\\def\\pp@spagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\count@0\\csname pp@g@#1@#3\\endcsname\n+    \\advance\\count@\\@ne\n+    \\expandafter\\xdef\\csname pp@g@#1@#3\\endcsname{\\number\\count@}%\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\count@}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@spchk@}\n+%   And this finally is the counter advance code for sorted counters\n+%   per page.  
Basically, we just use one count group per page.\n+%   Resetting a counter manually will not introduce a new count group,\n+%   and it would be hard to decide what to do in case count groups and\n+%   page positions overlap.\n+%    \\begin{macrocode}\n+\\def\\c@spchk@#1{\\pp@cl@begin\n+  \\addtocounter{pp@a@#1}\\@ne\n+  \\pp@fetchctr{#1}%\n+  \\ifx\\pp@page\\@empty\n+  \\else \\setcounter{#1}{\\pp@label}\\fi\n+  \\pp@writectr\\pp@ppagectr{#1}{\\noexpand\\theabspage}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@ppagectr}\n+%    \\begin{macrocode}\n+\\def\\pp@ppagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\def\\next{#3}%\n+    \\expandafter\\ifx\\csname pp@page@#1\\endcsname\\next\n+      \\addtocounter{pp@a@#1}\\@ne\n+    \\else\n+      \\setcounter{pp@a@#1}{\\value{pp@r@#1}}%\n+    \\fi\n+    \\global\\expandafter\\let\\csname pp@page@#1\\endcsname\\next\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\@testdef}\n+%   \\LaTeX\'s current (2007) definition of this macro causes save stack\n+%   overflow.  We fix this by an additional grouping.  Delay to the\n+%   beginning of document to keep Babel happy.\n+%   \\begin{macrocode}\n+\\AtBeginDocument{%\n+  \\begingroup\n+    \\@testdef{}{undefined}{}%\n+    \\expandafter\n+  \\endgroup\n+  \\ifx\\@undefined\\relax\n+    \\let\\pp@@testdef\\@testdef\n+    \\def\\@testdef#1#2#3{{\\pp@@testdef{#1}{#2}{#3}%\n+        \\if@tempswa\\aftergroup\\@tempswatrue\\fi}}%\n+  \\fi}\n+%</style>\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\Finale\n+% \\endinput\n+% Local Variables: \n+% mode: doctex\n+% TeX-master: "perpage.drv"\n+% End: \n'
diff -r 8dfd5d2b5903 -r b76c75521d91 search_ppep.py
--- a/search_ppep.py Mon Jul 11 19:22:54 2022 +0000
+++ b/search_ppep.py Fri Oct 28 18:26:42 2022 +0000
@@ -237,7 +237,10 @@
 
     # Parse Command Line
     parser = argparse.ArgumentParser(
-        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+        description=" ".join([
+            "Phopsphoproteomic Enrichment",
+            "phosphopeptide SwissProt search (in place in SQLite DB)."
+        ])
     )
 
     # inputs:
@@ -249,7 +252,11 @@
         nargs=1,
         required=True,
         dest="phosphopeptides",
-        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+        help=" ".join([
+            "Phosphopeptide data for experimental results,",
+            "generated by the Phopsphoproteomic Enrichment Localization",
+            "Filter tool"
+        ]),
     )
     parser.add_argument(
         "--uniprotkb",
@@ -257,7 +264,10 @@
         nargs=1,
         required=True,
         dest="uniprotkb",
-        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+        help=" ".join([
+            "UniProtKB/Swiss-Prot data, converted from FASTA format by the",
+            "Phopsphoproteomic Enrichment Kinase Mapping tool"
+        ]),
     )
     parser.add_argument(
         "--schema",
@@ -310,7 +320,8 @@
     cur.executescript(DROP_TABLES_SQL)
 
     # if options.db_schema:
-    #     print("\nAfter dropping tables/views that are to be created, schema is:")
+    #     print("\nAfter dropping tables/views that are to be created,"
+    #         + schema is:")
     #     cur.execute("SELECT * FROM sqlite_schema")
     #     for row in cur.fetchall():
     #         if row[4] is not None:
@@ -403,7 +414,11 @@
         deppep_count = row[0]
 
     cur.execute(
-        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+        """
+        SELECT count(*) FROM (
+          SELECT Sequence FROM UniProtKB GROUP BY Sequence
+          )
+        """
     )
     for row in cur.fetchall():
         sequence_count = row[0]
@@ -431,9 +446,11 @@
     old_seq = ""
     for row in cur.fetchall():
         if duplicate_count == 0:
-            print(
-                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
-            )
+            print(" ".join([
+                "\nEach of the following sequences is associated with several",
+                "accession IDs (which are listed in the first column) but",
+                "the same gene ID (which is listed in the second column)."
+            ]))
         if row[2] != old_seq:
             old_seq = row[2]
             duplicate_count += 1
@@ -480,13 +497,19 @@
                     )
             else:
                 raise ValueError(
-                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
-                    % (UniProtKB_id,)
+                    "UniProtKB_id %s, but Sequence is None: %s %s"
+                    % (
+                        UniProtKB_id,
+                        "Check whether SwissProt file is missing",
+                        "the sequence for this ID")
                 )
     ker.execute(
         """
-        SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
-        FROM     uniprotkb_pep_ppep_view
+        SELECT
+          count(*) ||
+            ' accession-peptide-phosphopeptide combinations were found'
+        FROM
+          uniprotkb_pep_ppep_view
         """
     )
     for row in ker.fetchall():
@@ -494,7 +517,9 @@
 
     ker.execute(
         """
-      SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
+      SELECT
+        count(*) || ' accession matches were found',
+        count(*) AS accession_count
       FROM     (
         SELECT   accession
         FROM     uniprotkb_pep_ppep_view
@@ -520,7 +545,9 @@
 
     ker.execute(
         """
-      SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+      SELECT
+        count(*) || ' phosphopeptide matches were found',
+        count(*) AS phosphopeptide_count
       FROM     (
         SELECT   phosphopeptide
         FROM     uniprotkb_pep_ppep_view
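The reformatted queries above report how many distinct accessions and phosphopeptides were matched in uniprotkb_pep_ppep_view. As a hedged cross-check from the R side of the pipeline, the same counts can be pulled with DBI/RSQLite; the database path is illustrative, while the view name and query structure come from the diff above:

    library(DBI)
    library(RSQLite)

    db <- DBI::dbConnect(RSQLite::SQLite(), "ppep_mapping_sqlite.sqlite")  # illustrative path
    DBI::dbGetQuery(db, "
      SELECT count(*) AS accession_count
      FROM (SELECT accession FROM uniprotkb_pep_ppep_view GROUP BY accession)
    ")
    DBI::dbGetQuery(db, "
      SELECT count(*) AS phosphopeptide_count
      FROM (SELECT phosphopeptide FROM uniprotkb_pep_ppep_view GROUP BY phosphopeptide)
    ")
    DBI::dbDisconnect(db)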