Repository 'mqppep_anova'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/mqppep_anova

Changeset 1:08678c931f5d (2022-10-28)
Previous changeset 0:dbff53e6f75f (2022-07-11) Next changeset 2:2336fbff8866 (2022-12-12)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
modified:
MaxQuantProcessingScript.R
macros.xml
mqppep_anova.R
mqppep_anova.xml
mqppep_anova_script.Rmd
mqppep_mrgfltr.py
search_ppep.py
added:
KSEA_impl_flowchart.dia
KSEA_impl_flowchart.pdf
kinase_name_uniprot_lut.tabular.bz2
kinase_uniprot_description_lut.tabular.bz2
mqppep_anova_preamble.tex
perpage.tex
b
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.dia
b
Binary file KSEA_impl_flowchart.dia has changed
b
diff -r dbff53e6f75f -r 08678c931f5d KSEA_impl_flowchart.pdf
b
Binary file KSEA_impl_flowchart.pdf has changed
b
diff -r dbff53e6f75f -r 08678c931f5d MaxQuantProcessingScript.R
--- a/MaxQuantProcessingScript.R Mon Jul 11 19:22:25 2022 +0000
+++ b/MaxQuantProcessingScript.R Fri Oct 28 18:27:21 2022 +0000
[
@@ -220,7 +220,6 @@
     type = "character",
     help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
   )
-  # default = "^Number of Phospho [(]STY[)]$",
   ,
   make_option(
     c("-p", "--phosphoCol"),
@@ -229,7 +228,6 @@
     help = paste0("PERL-compatible regular expression matching",
              " header of column having number of 'Phospho (STY)'")
   )
-  # default = "^Intensity[^_]",
   ,
   make_option(
     c("-s", "--startCol"),
@@ -238,7 +236,6 @@
     help = paste0("PERL-compatible regular expression matching",
              " header of column having first sample intensity")
   )
-  # default = 1,
   ,
   make_option(
     c("-I", "--intervalCol"),
@@ -247,7 +244,6 @@
     help = paste0("Column interval between the Intensities of samples",
              " (eg, 1 if subsequent column; 2 if every other column")
   )
-  # default = 0.75,
   ,
   make_option(
     c("-l", "--localProbCutoff"),
@@ -255,7 +251,6 @@
     type = "double",
     help = "Localization Probability Cutoff"
   )
-  # default = "sum",
   ,
   make_option(
     c("-f", "--collapse_func"),
@@ -264,7 +259,6 @@
     help = paste0("merge identical phosphopeptides",
              " by ('sum' or 'average') the intensities")
   )
-  # default = "filtered_data.txt",
   ,
   make_option(
     c("-r", "--filtered_data"),
@@ -272,7 +266,6 @@
     type = "character",
     help = "filtered_data.txt"
   )
-  # default = "quantData.txt",
   ,
   make_option(
     c("-q", "--quant_data"),
b
diff -r dbff53e6f75f -r 08678c931f5d kinase_name_uniprot_lut.tabular.bz2
b
Binary file kinase_name_uniprot_lut.tabular.bz2 has changed
b
diff -r dbff53e6f75f -r 08678c931f5d kinase_uniprot_description_lut.tabular.bz2
b
Binary file kinase_uniprot_description_lut.tabular.bz2 has changed
b
diff -r dbff53e6f75f -r 08678c931f5d macros.xml
--- a/macros.xml Mon Jul 11 19:22:25 2022 +0000
+++ b/macros.xml Fri Oct 28 18:27:21 2022 +0000
b
@@ -1,89 +1,47 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.1.13</token>
+    <token name="@TOOL_VERSION@">0.1.15</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="1.56.0"  >bioconductor-preprocesscore</requirement>
-            <requirement type="package" version="1.22.2"  >numpy</requirement>
+            <requirement type="package" version="6.2.1"   >gmp</requirement>
+            <requirement type="package" version="1.23.4"  >numpy</requirement>
             <requirement type="package" version="0.3.3"   >openblas</requirement>
-            <requirement type="package" version="1.4.1"   >pandas</requirement>
-            <requirement type="package" version="1.64"    >perl-dbd-sqlite</requirement>
-            <requirement type="package" version="5.26.2"  >perl</requirement>
-            <requirement type="package" version="1.4.0"   >pyahocorasick</requirement>
-            <requirement type="package" version="3.9.10"  >python</requirement>
-            <requirement type="package" version="1.14.2"  >r-data.table</requirement>
-            <requirement type="package" version="1.1.2"   >r-dbi</requirement>
-            <requirement type="package" version="3.3.5"   >r-ggplot2</requirement>
+            <requirement type="package" version="1.5.1"   >pandas</requirement>
+            <requirement type="package" version="1.70"    >perl-dbd-sqlite</requirement>
+            <requirement type="package" version="5.32.1"  >perl</requirement>
+            <requirement type="package" version="1.4.4"   >pyahocorasick</requirement>
+            <requirement type="package" version="3.10.6"  >python</requirement>
+            <requirement type="package" version="4.1.3"   >r-base</requirement>
+            <requirement type="package" version="6.0_93"  >r-caret</requirement>
+            <requirement type="package" version="1.14.4"  >r-data.table</requirement>
+            <requirement type="package" version="1.1.3"   >r-dbi</requirement>
+            <requirement type="package" version="3.3.6"   >r-ggplot2</requirement>
             <requirement type="package" version="3.1.3"   >r-gplots</requirement>
-            <requirement type="package" version="0.9.4"   >r-latex2exp</requirement>
-            <requirement type="package" version="1.7.1"   >r-optparse</requirement>
+            <requirement type="package" version="0.9.5"   >r-latex2exp</requirement>
+            <requirement type="package" version="1.7.3"   >r-optparse</requirement>
             <requirement type="package" version="1.4.4"   >r-reshape2</requirement>
-            <requirement type="package" version="2.11"    >r-rmarkdown</requirement>
-            <requirement type="package" version="2.2.8"   >r-rsqlite</requirement>
-            <requirement type="package" version="0.4.0"   >r-sass</requirement>
+            <requirement type="package" version="2.17"    >r-rmarkdown</requirement>
+            <!--
+            <requirement type="package" version="2.2.18"  >r-rsqlite</requirement>
+            <requirement type="package" version="0.4.2"   >r-sass</requirement>
+            -->
+            <requirement type="package" version="1.2.2"   >r-sessioninfo</requirement>
             <requirement type="package" version="0.4_11"  >r-sqldf</requirement>
-            <requirement type="package" version="1.4.0"   >r-stringr</requirement>
-            <requirement type="package" version="0.37"    >r-tinytex</requirement>
+            <requirement type="package" version="1.4.1"   >r-stringr</requirement>
+            <requirement type="package" version="0.42"    >r-tinytex</requirement>
             <requirement type="package" version="0.3.7"   >r-vioplot</requirement>
             <!--
             It would be nice to use conda-forge/texlive-core rather than r-tinytex because the
-            former installs texlive when the package is built, but issue 23 blocked PDF-creation.
+            former installs texlive when the package is built, but issue 19/61 blocked PDF-creation.
             Also, texlive-core also gave pango font errors (output had missing symbols replaced
             with boxes) unless I specified the build as well as the version when building a
             conda environment, e.g.:  texlive-core=20210325=h97429d4_0
             -->
         </requirements>
-        <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are not omitted; also, version numbers should lead to reproducible behavior.  Contrast execution times of this (about 18 seconds):
-            echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \
-              bioconductor-preprocesscore=1.56.0 \
-              numpy=1.22.2 \
-              openblas=0.3.3 \
-              pandas=1.4.1 \
-              perl-dbd-sqlite=1.64 \
-              perl-dbd-sqlite=1.64 \
-              perl=5.26.2 \
-              pyahocorasick=1.4.0 \
-              python=3.9.10 \
-              r-data.table=1.14.2 \
-              r-dbi=1.1.2 \
-              r-ggplot2=3.3.5 \
-              r-gplots=3.1.3 \
-              r-latex2exp=0.9.4 \
-              r-optparse=1.7.1 \
-              r-reshape2=1.4.4 \
-              r-rmarkdown=2.11 \
-              r-rsqlite=2.2.8 \
-              r-sass=0.4.0 \
-              r-sqldf=0.4_11 \
-              r-stringr=1.4.0 \
-              r-tinytex=0.37 \
-              r-vioplot=0.3.7
-          with this (42 or more seconds):
-            echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \
-              bioconductor-preprocesscore= \
-              numpy \
-              openblas=0.3.3 \
-              pandas \
-              perl \
-              perl-dbd-sqlite \
-              perl-dbd-sqlite \
-              pyahocorasick \
-              python \
-              r-data.table \
-              r-dbi \
-              r-ggplot2 \
-              r-gplots \
-              r-latex2exp \
-              r-optparse \
-              r-reshape2 \
-              r-rmarkdown \
-              r-rsqlite \
-              r-sass \
-              r-sqldf \
-              r-stringr \
-              r-tinytex \
-              r-vioplot
-
+        <!-- I specified the versions above because it takes a VERY long time
+             to search for package versions when they are omitted; also,
+             locking version numbers might lead to more-reproducible behavior.
         -->
     </xml>
 </macros>
b
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.R
--- a/mqppep_anova.R Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova.R Fri Oct 28 18:27:21 2022 +0000
[
b'@@ -1,20 +1,15 @@\n #!/usr/bin/env Rscript\n # libraries\n library(optparse)\n-library(data.table)\n library(stringr)\n+library(tinytex)\n \n # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285\n \n # parse options\n option_list <- list(\n-  make_option(\n-    c("-i", "--inputFile"),\n-    action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Phosphopeptide Intensities sparse input file path"\n-  ),\n+\n+  # files\n   make_option(\n     c("-a", "--alphaFile"),\n     action = "store",\n@@ -24,64 +19,11 @@\n              " path to text file having one column and no header")\n   ),\n   make_option(\n-    c("-S", "--preproc_sqlite"),\n-    action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n-  ),\n-  make_option(\n-    c("-K", "--ksea_sqlite"),\n+    c("-M", "--anova_ksea_metadata"),\n     action = "store",\n-    default = NA,\n-    type = "character",\n-    help = "Path to \'ksea_sqlite\' output produced by this tool"\n-  ),\n-  make_option(\n-    c("-f", "--firstDataColumn"),\n-    action = "store",\n-    default = "^Intensity[^_]",\n-    type = "character",\n-    help = "First column of intensity values"\n-  ),\n-  make_option(\n-    c("-m", "--imputationMethod"),\n-    action = "store",\n-    default = "random",\n+    default = "anova_ksea_metadata.tsv",\n     type = "character",\n-    help = paste0("Method for missing-value imputation,",\n-             " one of c(\'group-median\',\'median\',\'mean\',\'random\')")\n-  ),\n-  make_option(\n-    c("-p", "--meanPercentile"),\n-    action = "store",\n-    default = 3,\n-    type = "integer",\n-    help = paste0("Mean percentile for randomly generated imputed values;",\n-              ", range [1,99]")\n-  ),\n-  make_option(\n-    c("-d", "--sdPercentile"),\n-    action = "store",\n-    default = 3,\n-    type = "double",\n-    help = paste0("Adjustment value for standard deviation 
of",\n-              " randomly generated imputed values; real")\n-  ),\n-  make_option(\n-    c("-s", "--regexSampleNames"),\n-    action = "store",\n-    default = "\\\\.(\\\\d+)[A-Z]$",\n-    type = "character",\n-    help = "Regular expression extracting sample-names"\n-  ),\n-  make_option(\n-    c("-g", "--regexSampleGrouping"),\n-    action = "store",\n-    default = "(\\\\d+)",\n-    type = "character",\n-    help = paste0("Regular expression extracting sample-group",\n-             " from an extracted sample-name")\n+    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"\n   ),\n   make_option(\n     c("-o", "--imputedDataFile"),\n@@ -102,11 +44,56 @@\n         )\n   ),\n   make_option(\n+    c("-i", "--inputFile"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Phosphopeptide Intensities sparse input file path"\n+  ),\n+  make_option(\n+    c("-K", "--ksea_sqlite"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Path to \'ksea_sqlite\' output produced by this tool"\n+  ),\n+  make_option(\n+    c("-S", "--preproc_sqlite"),\n+    action = "store",\n+    default = NA,\n+    type = "character",\n+    help = "Path to \'preproc_sqlite\' produced by `mqppep_mrgfltr.py`"\n+  ),\n+  make_option(\n     c("-r", "--reportFile"),\n     action = "store",\n-    default = "QuantDataProcessingScript.html",\n+    default = "mqppep_anova.pdf",\n+    type = "character",\n+    help = "PDF report file path"\n+  ),\n+\n+  # parameters\n+  make_option(\n+    c("-f", "--firstDataColumn"),\n+    action = "store",\n+    default = "^Intensity[^_]",\n     type = "character",\n-    help = "HTML report file path"\n+    help = "First column of intensity values"\n+  ),\n+  make_option(\n+    c("-m", "--imputationMethod"),\n+    action = "store",\n+    default = "random",\n+    type = "character",\n+    help = paste0("Method for missing-value imputation,",\n+             " one of 
c(\'group-median\',\'median\',\'mean\',\'random\')")\n+  ),\n+  make_option(\n+    c("-C", "--intensityMinValuesPerClass"),\n+    action = "store",\n+    default = "0",\n+    type = "integer",\n+    hel'..b'_config_file_string(args$regexSampleGrouping, nc)\n+cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\\n"))\n+regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)\n cat(paste0("regex_sample_names: ",    regex_sample_names,    "\\n"))\n-cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\\n"))\n+\n+if (group_filter != "none") {\n+  cat(paste0("group_filter_patterns file: \'", args$sampleGroupFilterPatterns, "\'\\n"))\n+  group_filter_patterns <- read_config_file_string(args$sampleGroupFilterPatterns, nc)\n+} else {\n+  group_filter_patterns <- ".*"\n+}\n+cat(paste0("group_filter_patterns: ", group_filter_patterns, "\\n"))\n+\n+sink()\n+\n \n # from: https://github.com/molgenis/molgenis-pipelines/wiki/\n #   How-to-source-another_file.R-from-within-your-R-script\n@@ -253,45 +391,72 @@\n     return(NULL)\n }\n \n-script_dir <-  location_of_this_script()\n+# validation of input parameters is complete; it is now justifiable to\n+#   install LaTeX tools to render markdown as PDF; this involves a big\n+#   download from GitHub\n+if (!tinytex::is_tinytex()) tinytex::install_tinytex()\n \n rmarkdown_params <- list(\n-    inputFile = input_file\n-  , alphaFile = alpha_file\n-  , preprocDb = preproc_sqlite\n+\n+    # files\n+    alphaFile = alpha_file\n+  , anovaKseaMetadata = anova_ksea_metadata_file\n+  , imputedDataFilename = imputed_data_file\n+  , imputedQNLTDataFile = imp_qn_lt_data_file\n+  , inputFile = input_file\n+  , kseaAppPrepDb = ksea_sqlite_file\n+  , preprocDb = preproc_sqlite_file\n+\n+    # parameters\n   , firstDataColumn = first_data_column\n+  , groupFilter = group_filter\n+  , groupFilterMode = group_filter_mode         # arg 
sampleGroupFilterMode\n+  , groupFilterPatterns = group_filter_patterns # arg sampleGroupFilterPatterns\n   , imputationMethod = imputation_method\n+  , intensityMinValuesPerGroup = intensity_min_values_per_class\n+  , kseaCutoffStatistic = ksea_cutoff_statistic\n+  , kseaCutoffThreshold = ksea_cutoff_threshold\n+  , kseaMinSubstrateCount = ksea_min_substrate_count\n+  , kseaUseAbsoluteLog2FC = ksea_use_absolute_log2_fc # add\n   , meanPercentile = mean_percentile\n-  , sdPercentile = sd_percentile\n+  , minQuality = min_quality                          # add\n+  , regexSampleGrouping = regex_sample_grouping\n   , regexSampleNames = regex_sample_names\n-  , regexSampleGrouping = regex_sample_grouping\n-  , imputedDataFilename = imputed_data_file_name\n-  , imputedQNLTDataFile = imp_qn_lt_data_filenm\n-  , anovaKseaMetadata = anova_ksea_metadata\n-  , kseaAppPrepDb = ksea_sqlite\n-  , kseaCutoffThreshold = ksea_cutoff_threshold\n-  , kseaCutoffStatistic = ksea_cutoff_statistic\n+  , sdPercentile = sd_percentile\n   )\n \n print("rmarkdown_params")\n-str(rmarkdown_params)\n+print(rmarkdown_params)\n+print(\n+  lapply(\n+    X = rmarkdown_params,\n+    FUN = function(x) {\n+      paste0(\n+        nchar(as.character(x)),\n+        ": \'",\n+        as.character(x),\n+        "\'"\n+      )\n+    }\n+  )\n+)\n+\n \n # freeze the random number generator so the same results will be produced\n #  from run to run\n set.seed(28571)\n \n-# BUG (or "opportunity")\n-# To render as PDF for the time being requires installing the conda\n-# package `r-texlive` until this issue in `texlive-core` is resolved:\n-#   https://github.com/conda-forge/texlive-core-feedstock/issues/19\n-# This workaround is detailed in the fourth comment of:\n-#   https://github.com/conda-forge/texlive-core-feedstock/issues/61\n+script_dir <-  location_of_this_script()\n \n-library(tinytex)\n-tinytex::install_tinytex()\n rmarkdown::render(\n   input = paste(script_dir, "mqppep_anova_script.Rmd", sep = 
"/")\n-, output_format = rmarkdown::pdf_document(toc = TRUE)\n , output_file = report_file_name\n , params = rmarkdown_params\n+, output_format = rmarkdown::pdf_document(\n+    includes = rmarkdown::includes(in_header = "mqppep_anova_preamble.tex")\n+  , dev = "pdf"\n+  , toc = TRUE\n+  , toc_depth = 2\n+  , number_sections = FALSE\n+  )\n )\n'
b
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova.xml
--- a/mqppep_anova.xml Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova.xml Fri Oct 28 18:27:21 2022 +0000
[
b'@@ -7,6 +7,28 @@\n     <description>Runs ANOVA and KSEA for phosphopeptides.</description>\n     <macros>\n         <import>macros.xml</import>\n+        <xml name="group_matching_parm">\n+            <param name="group_filter_mode" type="select"\n+                   help="Regular expression matching mode \'fixed\', \'perl\', or \'grep\' with option for case insensitivity.  See https://rdrr.io/r/base/grep.html"\n+                   label="Sample-group matching mode"\n+            >\n+                <option value="r">ERE ("extended regular expressions")</option>\n+                <option value="ri">  - ERE, case insensitive</option>\n+                <option value="p" selected="true">PCRE ("PERL-compatible regular expressions")</option>\n+                <option value="pi">  - PCRE, case insensitive</option>\n+                <option value="f">fixed strings ("no regular expressions")</option>\n+                <option value="fi">  - fixed strings, case insensitive</option>\n+            </param>\n+            <param name="group_filter_patterns" type="text" value=".+"\n+                   help="Comma-separated list of regular expressions matching group-names"\n+                   label="Sample-group matching pattern">\n+              <sanitizer>\n+                <valid initial="string.printable">\n+                  <remove value="&apos;"/>\n+                </valid>\n+              </sanitizer>\n+            </param>\n+        </xml>\n     </macros>\n     <edam_topics>\n         <edam_topic>topic_0121</edam_topic><!-- proteomics -->\n@@ -27,29 +49,58 @@\n       both need access to a writeable directory, but most directories in a\n       biocontainer are read-only, so this builds a pseudo-home under /tmp\n     -->\n+    <required_files>\n+      <include path="KSEA_impl_flowchart.pdf" />\n+      <include path="kinase_name_uniprot_lut.tabular.bz2" />\n+      <include path="kinase_uniprot_description_lut.tabular.bz2" />\n+      <include 
path="kinase_uniprot_description_lut.tabular.bz2" />\n+      <include path="mqppep_anova.R" />\n+      <include path="mqppep_anova_preamble.tex" />\n+      <include path="mqppep_anova_script.Rmd" />\n+      <include path="perpage.tex" />\n+    </required_files>\n     <command detect_errors="exit_code"><![CDATA[\n+      (printenv | sort) &&\n       cp \'$__tool_directory__/mqppep_anova_script.Rmd\' . &&\n-      cp \'$__tool_directory__/mqppep_anova.R\'          . &&\n+      cp \'$__tool_directory__/mqppep_anova.R\' . &&\n+      cp \'$__tool_directory__/kinase_name_uniprot_lut.tabular.bz2\' . &&\n+      cp \'$__tool_directory__/kinase_uniprot_description_lut.tabular.bz2\' . &&\n+      cp \'$__tool_directory__/mqppep_anova_preamble.tex\' . &&\n+      cp \'$__tool_directory__/perpage.tex\' . &&\n+      cp \'$__tool_directory__/KSEA_impl_flowchart.pdf\' . &&\n       Rscript mqppep_anova.R\n         --inputFile \'$input_file\'\n         --alphaFile \'$alpha_file\'\n         --preproc_sqlite \'$preproc_sqlite\'\n-        --firstDataColumn $intensity_column_regex_f\n+        --firstDataColumn \'$intensity_column_regex_f\'\n         --imputationMethod $imputation.imputation_method\n         #if $imputation.imputation_method == "random"\n           --meanPercentile \'$imputation.meanPercentile\'\n           --sdPercentile   \'$imputation.sdPercentile\'\n         #end if\n-        --regexSampleNames $sample_names_regex_f\n-        --regexSampleGrouping $sample_grouping_regex_f\n-        --imputedDataFile $imputed_data_file\n+        --regexSampleNames \'$sample_names_regex_f\'\n+        --regexSampleGrouping \'$sample_grouping_regex_f\'\n+        #if $group_filter.group_filter_method == "none"\n+          --sampleGroupFilter \'none\'\n+        #else\n+          --sampleGroupFilter \'$group_filter.group_filter_method\'\n+          --sampleGroupFilterPatterns \'$group_filter_patterns_f\'\n+          --sampleGroupFilterMode \'$group_filter.group_filter_mode\'\n+        #end 
if\n+        --intensityMinValuesPerClass \'$intnsty_min_vals_per_smpl_grp\'\n+        --imputedDataFile \'$imputed_data_file\'\n         --imputedQNLTDataFile \'$im'..b'nitude of the differences across the contrast for all of the substrates when aggregating them to assess the enrichment of a given kinase\'s substrates.  When FALSE, also consider the direction.  Surprisingly, setting this to TRUE may decrease the enriched kinases. \n+\n+``Minimum quality of substrates for KSEA``\n+  An arbitrary "quality score" is assigned to each substrate, as described in the PDF report produced by the tool.  This score takes into account both FDR-adjusted p-value and the number of missing values for each substrate.  Setting the minimum to zero retains all substrates, which may be a large number.\n \n **Outputs**\n+===========\n \n-``imputed_intensities (input_file.imputation_method-imputed_intensities)``\n-  Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.\n+Report dataset\n+   *[input file].[imputation method]*-``imputed_report``\n+\n+   Summary report for normalization, imputation, and **ANOVA**, in PDF format.\n \n-``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)``\n-  Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.\n+Imputed intensities\n+   *[input file].[imputation method]*-``imputed_intensities``\n+\n+   Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.\n \n-``report_file (input_file.imputation_method-imputed_report)``\n-  Summary report for normalization, imputation, and **ANOVA**, in PDF format.\n+Imputed quantum-normalized log-transformed intensities\n+   *[input file].[imputation method]*-``imputed_QN_LT_intensities``\n+\n+   Phosphopeptide MS intensities where missing values have 
been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.\n \n-``anova_ksea_metadata (input_file.imputation_method-imputed_anova_ksea_metadata)``\n-  Phosphopeptide metadata including ANOVA significance and KSEA enrichments.\n+ANOVA KSEA metadata\n+   *[input file].[imputation method]*-``imputed_anova_ksea_metadata``\n+   Phosphopeptide metadata including ANOVA significance and KSEA enrichments.\n \n-``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)``\n-  SQLite database for ad-hoc report creation.\n+KSEA SQLite database sqlite\n+   *[input file].[imputation method]*-``imputed_ksea_sqlite``\n+   An SQLite database that is usable for *ad hoc* report creation.\n \n **Algorithm**\n+=============\n \n-The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017].\n-The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool.\n+The KSEA algorithm used here is as in the KSEAapp package as reported in `[Wiredja 2017] <https://doi.org/10.1093/bioinformatics/btx415>`_.\n+The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." <https://cran.r-project.org/package=KSEAapp>`_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches.\n \n **Authors**\n+===========\n \n ``Larry C. Cheng``\n   (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.\n@@ -337,5 +525,11 @@\n         <citation type="doi">10.3791/57996</citation>\n         <!-- Wiredja_2017 "The KSEA App ..." 
PMID: 28655153 -->\n         <citation type="doi">10.1093/bioinformatics/btx415</citation>\n+        <citation type="bibtex">@Manual{,\n+                title = {KSEAapp: Kinase-Substrate Enrichment Analysis},\n+                author = {Danica D. Wiredja},\n+                year = {2017},\n+                note = {R package version 0.99.0},\n+        }</citation>\n     </citations>\n </tool>\n'
b
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_preamble.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mqppep_anova_preamble.tex Fri Oct 28 18:27:21 2022 +0000
[
@@ -0,0 +1,90 @@
+% -----------------------------------------------------------------------------
+% preamble includes BEGIN
+% -----------------------------------------------------------------------------
+\usepackage{longtable, lscape, ifthen}
+
+% -----------------------------------------------------------------------------
+% put \T or \B at the ends of lines to add space for super- or sub-
+%   scripts above or below, respectively
+% ref: ?
+\newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
+\newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
+
+% -----------------------------------------------------------------------------
+% horizontal line commands; ideally, these would compute the width rather than
+%   hardcoding it
+% ref: ?
+\def\hlinport{\makebox[6.5in]{\hrulefill} \\} % hline outside tabular, port
+\def\hlinlscp{\makebox[9in]{\hrulefill} \\} % hline outside tabular, lndscp
+%ref: https://stackoverflow.com/a/67335722
+\def\hlinnotab{\\makebox[1.0\linewidth]{\hrulefill}\\[1ex]}
+
+% -----------------------------------------------------------------------------
+% ref: https://latex.org/forum/viewtopic.php?p=23257#p23257
+\newcommand{\nonemptyline}[1]{%
+  %\ifthenelse{\equal{#1}{}}{do when empty}{do when not empty}
+  \ifthenelse{\equal{#1}{}}{}{#1}%
+}
+
+% -----------------------------------------------------------------------------
+% For RMarkdown, I needed to put this into a preamble.tex file and include it
+%   via `output: pdf_document: includes: in_header: preamble.tex` because
+%   Markdown was expanding the \tabfill command before writing the tex file
+% ref: https://tex.stackexchange.com/a/119477 in reply to
+%      https://tex.stackexchange.com/questions/119473/tabbing-and-line-wrapping
+\makeatletter
+\newlength\tdima
+\newcommand\tabfill[1]{\setlength\tdima{\linewidth}%
+  \addtolength\tdima{\@totalleftmargin}%
+  \addtolength\tdima{-\dimen\@curtab}%
+  \parbox[t]{\tdima}{#1\ifhmode\strut\fi}}
+  %\parbox[t]{\tdima}{\nonemptyline{#1}\ifhmode\strut\fi}}
+\makeatother
+%
+% Create a tabbing environment in which to use tabfill
+% param #1 specifies the tabstops (as expected by the tabbing
+% environment) and is provided in braces after invocation, e.g.:
+%   \begin{tabwrap}{\hspace{1.25in}\=}
+% param #2 is the contents of the environment
+\newenvironment{tabwrap}[2]{%
+  \begin{tabbing}#1\kill\ignorespaces%
+  #2}%
+  {\end{tabbing}%
+}
+
+% -----------------------------------------------------------------------------
+% Make a caption for a non-floating figure or table, e.g.,
+% ref: https://github.com/rf-latex/capt-of/blob/main/capt-of.dtx
+%      https://texfaq.org/FAQ-figurehere
+%   
+% Usage: \captionof{*type*}[*move*]{*caption*}
+%        *type*    is `figure` or `table` (or some type you've
+%                    defined with the `float` package)
+%        *move*    is the optional moving argument *caption* (the thing
+%                    that goes to the list of tables/figures)
+%        *caption* is the text of the caption
+\makeatletter
+\newcommand\captionof[1]{\def\@captype{#1}\caption}
+\makeatother
+%
+%%ACE \captionof{table}{Hello world from line 210}
+% To circumvent mis-numbering of interleaved float and non-float table
+%   and figure captions, it is necessary to include the `perpage` package and
+%   "make them sorted" (FFI see https://texfaq.org/FAQ-figurehere)
+% I (ACE) don't know how to get this package to include:
+%    \usepackage{bigfoot}
+% so I included the source instead:
+\makeatletter
+\input{perpage.tex}
+\makeatother
+%
+% Ensure that table numbers are sorted
+\MakeSorted{table}
+% Ensure that figure numbers are sorted
+\MakeSorted{figure}
+
+% -----------------------------------------------------------------------------
+
+% -----------------------------------------------------------------------------
+% preamble includes END
+% -----------------------------------------------------------------------------
b
diff -r dbff53e6f75f -r 08678c931f5d mqppep_anova_script.Rmd
--- a/mqppep_anova_script.Rmd Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_anova_script.Rmd Fri Oct 28 18:27:21 2022 +0000
[
b'@@ -7,81 +7,153 @@\n date:\n - "May 28, 2018"\n - "; revised June 23, 2022"\n+lot: true\n output:\n   pdf_document:\n     toc: true\n-    toc_depth: 3\n+    toc_depth: 2\n     keep_tex: true\n-header-includes:\n-  - \\usepackage{longtable}\n-  - \\newcommand\\T{\\rule{0pt}{2.6ex}}       % Top strut\n-  - \\newcommand\\B{\\rule[-1.2ex]{0pt}{0pt}} % Bottom strut\n+    dev: pdf\n+    includes:\n+      in_header: mqppep_anova_preamble.tex\n+latex_macros: false\n+raw_tex: true\n+urlcolor: blue\n params:\n   alphaFile:            "test-data/alpha_levels.tabular"\n   inputFile:            "test-data/test_input_for_anova.tabular"\n   preprocDb:            "test-data/test_input_for_anova.sqlite"\n   kseaAppPrepDb:        !r c(":memory:", "test-data/mqppep.sqlite")[2]\n-  show_toc:             true\n-  firstDataColumn:      "^Intensity[^_]"\n-  imputationMethod:     !r c("group-median", "median", "mean", "random")[1]\n-  meanPercentile:       1\n-  sdPercentile:         1.0\n   regexSampleNames:     "\\\\.\\\\d+[A-Z]$"\n   regexSampleGrouping:  "\\\\d+"\n+  groupFilterPatterns:  ".+"\n+  groupFilter:    !r c("none", "exclude", "include")[1]\n+  imputationMethod:     !r c("group-median", "median", "mean", "random")[4]\n+  kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5, 0.9)[5]\n+  #imputationMethod:     !r c("group-median", "median", "mean", "random")[1]\n+\n+  # how should sample groups be interpreted?\n+  #  - "f": fixed patterns (like `grep -F`)\n+  #  - "p": PERL-compatible (like `grep -P`)\n+  #  - "r": extended grep patterns (like `grep -E`)\n+  # use what case sensitivity?\n+  #  - "i": case insensitive matching (like `grep -i`)\n+  groupFilterMode: !r c("r", "ri", "p", "pi", "f", "fi")[1]\n+  # what pattern should be used for the first column\n+  #   (extended grep pattern, case sensitive)\n+  firstDataColumn:      "^Intensity[^_]"\n+  # for small random value imputation, what percentile should be center?\n+  meanPercentile:       50\n+  #meanPercentile:       1\n+ 
 # for small random value imputation, what should `s / mean(x)` ratio be?\n+  sdPercentile:         1.0\n+  # output path for imputed data file\n   imputedDataFilename:  "test-data/limbo/imputedDataFilename.txt"\n+  # output path for imputed/quantile-normalized/log-transformed data file\n   imputedQNLTDataFile:  "test-data/limbo/imputedQNLTDataFile.txt"\n+  # output path for contents of `stats_metadata_v` table\n   anovaKseaMetadata:    "test-data/limbo/anovaKseaMetadata.txt"\n+  # how to test one variable with > 2 categories (e.g., aov or kruskal.test)\n   oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]\n+  # how to test one variable with 2 categories (e.g., oneway.test)\n   oneWayTwoCategories:  !r c("aov", "kruskal.test", "oneway.test")[3]\n-  kseaCutoffStatistic:  !r c("p.value", "FDR")[2]\n-  kseaCutoffThreshold:  !r c( 0.1, 0.05)[2]\n-  kseaMinKinaseCount:   1\n-  intensityHeatmapRows: 75\n+  # what should be the minimum quality for consideration in both\n+  minQuality:           0\n+  # correct KSEA with FDR (recommended) or raw p-value\n+  kseaCutoffStatistic:  !r c("FDR", "p.value")[1]\n+  # correct KSEA threshold 0.05 (conventional) or higher (perhaps better)\n+  #   "perhaps better" meaning that KSEA is an hypothesis-generator, not -test\n+  #kseaCutoffThreshold:  !r c(0.05, 0.1, 0.25, 0.5)[1]\n+  # minimum number of substrates required for a kinase to be considered in KSEA\n+  kseaMinSubstrateCount: 1\n+  # Should KSEA be performed aggregating signed log2FC or absolute?\n+  # FALSE use raw log2FC for KSEA as for KSEAapp::KSEA.Scores\n+  # TRUE  use abs(log2FC) for KSEA as Justin Drake requested; this is a\n+  #         justifiable deviation from the KSEAapp::KSEA.Scores algorithm.\n+  kseaUseAbsoluteLog2FC: TRUE\n+  #kseaUseAbsoluteLog2FC: FALSE\n+  # minimum number of observed values per sample-group\n+  intensityMinValuesPerGroup: 1\n+  # maximum number of heatmap rows (result are poor when > 50)\n+  intensityHeatmapRows: 50\n+  # 
what should be the primary criterion to eliminate excessive heatmap rows\n+  intensityHeatmapCriteria: '..b'nb_messages) nbe("Output quantile normalized data tabular file\\n")\n+  write.table(\n+    data_table_imputed,\n+    file = imp_qn_lt_data_filenm,\n+    sep = "\\t",\n+    col.names = TRUE,\n+    row.names = FALSE,\n+    quote = FALSE\n   )\n \n-\n-#output quantile normalized data\n-impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)\n-colnames(impish)[1] <- "Phosphopeptide"\n-data_table_imputed <- sqldf(data_table_imputed_sql)\n-# Zap the duplicated \'Phosphopeptide\' column named \'ppep\'\n-data_table_imputed <-\n-    data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]\n-write.table(\n-  data_table_imputed,\n-  file = imp_qn_lt_data_filenm,\n-  sep = "\\t",\n-  col.names = TRUE,\n-  row.names = FALSE,\n-  quote = FALSE\n-)\n-\n-ppep_kinase <- sqldf("\n-  SELECT DISTINCT k.ppep, k.kinase\n-    FROM (\n-      SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n-        FROM pseudo_ksdata\n-        WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n-      ) k\n-    ORDER BY k.ppep, k.kinase\n-  ")\n-\n-RSQLite::dbWriteTable(\n-  conn = db,\n-  name = "ksea_enriched_ks",\n-  value = ppep_kinase,\n-  append = FALSE\n-  )\n+  ppep_kinase <- sqldf("\n+    SELECT DISTINCT k.ppep, k.kinase\n+      FROM (\n+        SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep\n+          FROM pseudo_ksdata\n+          WHERE GENE IN (SELECT kinase FROM enriched_kinases)\n+        ) k\n+      ORDER BY k.ppep, k.kinase\n+    ")\n+\n+  RSQLite::dbWriteTable(\n+    conn = db,\n+    name = "ksea_enriched_ks",\n+    value = ppep_kinase,\n+    append = FALSE\n+    )\n+}\n+\n+if (print_nb_messages) nb("RSQLite::dbWriteTable anova_signif\\n")\n \n RSQLite::dbWriteTable(\n   conn = db,\n@@ -3453,6 +6293,8 @@\n     "\n   )\n \n+if (print_nb_messages) nb("Output contents of `stats_metadata_v` table to tabular file\\n")\n+if (print_nb_messages) nbe("Output 
contents of `stats_metadata_v` table to tabular file\\n")\n write.table(\n   dbReadTable(db, "stats_metadata_v"),\n   file = anova_ksea_mtdt_file,\n@@ -3462,75 +6304,21 @@\n   quote = FALSE\n   )\n \n+cat("\\n\\\\clearpage\\n")\n \n ```\n \n+# Data-processing summary flowchart\n+\n+![Flowchart showing ANOVA and KSEA data-processing steps](KSEA_impl_flowchart.pdf)\n+\n ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = \'asis\'}\n cat("\\\\leavevmode\\n\\n\\n")\n \n-# write parameters to report\n-\n-param_unlist <- unlist(as.list(params))\n-param_df <- data.frame(\n-  parameter = paste0("\\\\verb@", names(param_unlist), "@"),\n-  value = paste0("\\\\verb@", gsub("$", "\\\\$", param_unlist, fixed = TRUE), "@")\n-  )\n-\n-data_frame_latex(\n-  x = param_df,\n-  justification = "p{0.35\\\\linewidth} p{0.6\\\\linewidth}",\n-  centered = TRUE,\n-  caption = "Input parameters",\n-  anchor = const_table_anchor_bp,\n-  underscore_whack = FALSE\n-  )\n-\n-# write parameters to SQLite output\n-\n-mqppep_anova_script_param_df <- data.frame(\n-  script    = "mqppep_anova_script.Rmd",\n-  parameter = names(param_unlist),\n-  value     = param_unlist\n-  )\n-ddl_exec(db, "\n-  DROP TABLE IF EXISTS script_parameter;\n-  "\n-)\n-ddl_exec(db, "\n-  CREATE TABLE IF NOT EXISTS script_parameter(\n-    script    TEXT,\n-    parameter TEXT,\n-    value     ANY,\n-    UNIQUE (script, parameter) ON CONFLICT REPLACE\n-    )\n-    ;\n-  "\n-)\n-RSQLite::dbWriteTable(\n-  conn = db,\n-  name = "script_parameter",\n-  value = mqppep_anova_script_param_df,\n-  append = TRUE\n-)\n-\n+write_params(db)\n # We are done with output\n RSQLite::dbDisconnect(db)\n+\n+cat("\\\\clearpage\\n\\\\section{R package versions}\\n")\n+utils::toLatex(utils::sessionInfo())\n ```\n-<!--\n-There\'s gotta be a better way...\n-\n-loaded_packages_df <-  sessioninfo::package_info("loaded")\n-loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)\n-loaded_packages_df <- data.frame(\n-  
package = loaded_packages_df$package,\n-  version = loaded_packages_df$loadedversion,\n-  date    = loaded_packages_df$date\n-  )\n-data_frame_latex(\n-  x = loaded_packages_df,\n-  justification = "l | l l",\n-  centered = FALSE,\n-  caption = "Loaded R packages",\n-  anchor = const_table_anchor_bp\n-  )\n--->\n'
b
diff -r dbff53e6f75f -r 08678c931f5d mqppep_mrgfltr.py
--- a/mqppep_mrgfltr.py Mon Jul 11 19:22:25 2022 +0000
+++ b/mqppep_mrgfltr.py Fri Oct 28 18:27:21 2022 +0000
[
b'@@ -87,7 +87,10 @@\n         nargs=1,\n         required=True,\n         dest="phosphopeptides",\n-        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",\n+        help=" ".join([\n+            "Phosphopeptide data for experimental results, including the",\n+            "intensities and the mapping to kinase domains, in tabular format"\n+        ]),\n     )\n     #   UniProtKB/SwissProt DB input, SQLite\n     parser.add_argument(\n@@ -106,7 +109,10 @@\n         required=False,\n         default=[],\n         dest="species",\n-        help="limit PhosphoSitePlus records to indicated species (field may be empty)",\n+        help=" ".join([\n+            "limit PhosphoSitePlus records to indicated species",\n+            "(field may be empty)"\n+        ]),\n     )\n \n     # outputs:\n@@ -174,7 +180,7 @@\n     # determine species to limit records from PSP_Regulatory_Sites\n     if options.species is None:\n         exit(\n-            \'Argument "species" is required (and may be empty) but not supplied\'\n+            \'Argument "species" is required (& may be empty) but not supplied\'\n         )\n     try:\n         if len(options.species) > 0:\n@@ -216,20 +222,25 @@\n         FUNCTION_PHOSPHORESIDUE = (\n             "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"\n         )\n-        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB\n-        ON_FUNCTION = (\n-            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites\n-        )\n-        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites\n-        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n-        ON_PROCESS = (\n-            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites\n-        )\n-        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+        # Gene Name from UniProtKB\n+        
GENE_NAME = "Gene_Name"\n+        # ON_FUNCTION column from PSP_Regulatory_Sites\n+        ON_FUNCTION = ("ON_FUNCTION")\n+        # NOTES column from PSP_Regulatory_Sites\n+        ON_NOTES = "NOTES"\n+        # ON_OTHER_INTERACT column from PSP_Regulatory_Sites\n+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"\n+        # ON_PROCESS column from PSP_Regulatory_Sites\n+        ON_PROCESS = ("ON_PROCESS")\n+        # ON_PROT_INTERACT column from PSP_Regulatory_Sites\n+        ON_PROT_INTERACT = "ON_PROT_INTERACT"\n         PHOSPHOPEPTIDE = "Phosphopeptide"\n         PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"\n         PHOSPHORESIDUE = "Phosphoresidue"\n-        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"\n+        PUTATIVE_UPSTREAM_DOMAINS = " ".join([\n+            "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",\n+            "Phosphatases/Binding Domains"\n+        ])\n         SEQUENCE = "Sequence"\n         SEQUENCE10 = "Sequence10"\n         SEQUENCE7 = "Sequence7"\n@@ -328,8 +339,26 @@\n             CitationData\n           ) VALUES (?,?)\n           """\n-        CITATION_INSERT_PSP = \'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. 
PMID: 25514926."\'\n-        CITATION_INSERT_PSP_REF = \'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122\'\n+        CITATION_INSERT_PSP = " '..b'- end read upstream_data_melt --------------------------------------\n+        # ... end read upstream_data_melt ---------------------------------\n \n         end_time = time.process_time()  # timer\n         print(\n@@ -1332,10 +1312,13 @@\n             if p_peptide in melt_dict:\n                 melt_dict[p_peptide].append(characterization)\n             else:\n-                exit(\n-                    \'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping\'\n-                    % (p_peptide)\n-                )\n+                los = [\n+                    "Phosphopeptide %s" % p_peptide,\n+                    "not found in ppep_mapping_db:",\n+                    \'"phopsphopeptides" and "ppep_mapping_db" must both\',\n+                    "originate from the same run of mqppep_kinase_mapping"\n+                ]\n+                exit(" ".join(los))\n \n         end_time = time.process_time()  # timer\n         print(\n@@ -1397,29 +1380,12 @@\n             ]\n         ]\n \n-        # cols_output_prelim = output_df.columns.tolist()\n-        #\n-        # print("cols_output_prelim")\n-        # print(cols_output_prelim)\n-        #\n-        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]\n-        #\n-        # print("cols_output with p-peptide")\n-        # print(cols_output)\n-        #\n-        # cols_output = [col for col in cols_output if not col == "p-peptide"]\n-        #\n-        # print("cols_output")\n-        # print(cols_output)\n-        #\n-        # output_df = output_df[cols_output]\n-\n         # join output_df back to quantitative columns in 
data_in df\n         quant_cols = data_in.columns.tolist()\n         quant_cols = quant_cols[1:]\n         quant_data = data_in[quant_cols]\n \n-        # ----------- Write merge/filter metadata to SQLite database (start) -----------\n+        # ---- Write merge/filter metadata to SQLite database (start) ----\n         # Open SwissProt SQLite database\n         conn = sql.connect(output_sqlite)\n         cur = conn.cursor()\n@@ -1467,7 +1433,7 @@\n \n         # Close SwissProt SQLite database\n         conn.close()\n-        # ----------- Write merge/filter metadata to SQLite database (finish) -----------\n+        # ---- Write merge/filter metadata to SQLite database (finish) ----\n \n         output_df = output_df.merge(\n             quant_data,\n@@ -1480,15 +1446,18 @@\n         output_df = output_df[output_cols]\n \n         # cosmetic changes to Upstream column\n+        # fill the NaN with "" for those Phosphopeptides that got a\n+        #   "WARNING: Failed match for " in the upstream mapping\n         output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[\n             PUTATIVE_UPSTREAM_DOMAINS\n         ].fillna(\n             ""\n-        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping\n+        )\n         us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])\n         i = 0\n         while i < len(us_series):\n-            # turn blanks into N_A to signify the info was searched for but cannot be found\n+            # turn blanks into N_A to signify the info\n+            #   that was searched for but cannot be found\n             if us_series[i] == "":\n                 us_series[i] = N_A\n             i += 1\n@@ -1530,8 +1499,9 @@\n         # Rev. 7/1/2016\n         # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A\'s\n         # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS\n-        # Rev. 
12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \\\n-        #                read from SwissProt SQLite database\n+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast \\\n+        #                 Aho-Corasick searching; \\\n+        #                 read from SwissProt SQLite database\n         # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper\n \n         #\n'
b
diff -r dbff53e6f75f -r 08678c931f5d perpage.tex
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/perpage.tex Fri Oct 28 18:27:21 2022 +0000
[
b"@@ -0,0 +1,547 @@\n+% \\iffalse\n+%%\n+%% perpage is part of the bigfoot bundle for critical typesetting\n+%% Copyright 2002--2014  David Kastrup <dak@gnu.org>\n+%%\n+%% The license notice and corresponding source code for this file are\n+%% contained in perpage.dtx.\n+%%\n+% This program is free software; you can redistribute it and/or modify\n+% it under the terms of the GNU General Public License as published by\n+% the Free Software Foundation; either version 2 of the License, or\n+% (at your option) any later version.\n+%\n+% This program is distributed in the hope that it will be useful,\n+% but WITHOUT ANY WARRANTY; without even the implied warranty of\n+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n+% GNU General Public License for more details.\n+%\n+% You should have received a copy of the GNU General Public License\n+% along with this program; if not, write to the Free Software\n+% Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA\n+% \\fi\n+% \\CheckSum{396}\n+% \\GetFileInfo{perpage.sty}\n+% \\date{\\filedate}\n+% \\author{David Kastrup\\thanks{\\texttt{dak@gnu.org}}}\n+% \\title{The \\texttt{perpage} package\\\\Version \\fileversion}\n+% \\maketitle\n+% \\section{Description}\n+%\n+% The \\texttt{perpage} package adds the ability to reset counters per\n+% page and/or keep their occurences sorted in order of appearance on\n+% the page.\n+%\n+% It works by attaching itself to the code for \\cmd{\\stepcounter} and\n+% will then modify the given counter according to information written\n+% to the |.aux| file, which means that multiple passes may be needed.\n+% Since it uses the internals of the \\cmd{\\label} mechanism, the need\n+% for additional passes will get announced by \\LaTeX\\ as ``labels may\n+% have changed''.\n+%\n+% \\DescribeMacro{\\MakePerPage}\n+% \\begin{quote}\n+%   |\\MakePerPage[2]{footnote}|\n+% \\end{quote}\n+% will start footnote numbers with~2 on each page (the optional\n+% argument 
defaults to~1).  2~might be a strange number, unless you\n+% have used something like\n+% \\begin{quote}\n+%   |\\renewcommand\\thefootnote{\\fnsymbol{footnote}}|\n+% \\end{quote}\n+% and want to start off with a dagger.  The starting value must not be\n+% less than~1 so that the counter logic can detect the reset of a\n+% counter\n+% reliably.\\footnote{This unfortunately means that you can't just use\n+%   \\cmd{\\alph} in order to get figures on page~10 numbered as ``10'',\n+%   ``10a'', ``10b''.}\n+% It could be a good idea to redefine |\\@cnterr| if you use a format\n+% with limited range: at the first pass, footnotes are not reset\n+% across pages and things like |\\fnsymbol| will quickly run out of\n+% characters to use.\n+%\n+% \\DescribeMacro{\\theperpage}\n+% If you want to label things also on a per page base, for example\n+% with\n+% \\begin{quote}\n+%   |\\renewcommand{\\thefigure}{\\thepage-\\arabic{figure}}|\n+% \\end{quote}\n+% you'll have the problem that \\cmd{\\thepage} is updated\n+% asynchronously with the real page, since \\TeX\\ does not know which\n+% page the figure will end up.  If you have used the |perpage| package\n+% for modifying the figure counter, however, at the point where the\n+% counter is incremented, the macro \\cmd{\\theperpage} will be set to\n+% the correct value corresponding to the actual page location.  Note\n+% that this macro is shared between all counters, so advancing a\n+% different counter under control of |perpage| will render\n+% \\cmd{\\thefigure} incorrect.\n+%\n+% \\DescribeMacro{\\MakeSorted}\n+% \\begin{quote}\n+%   |\\MakeSorted{figure}|\n+% \\end{quote}\n+% will make the |figure| counter get `sorted': this means that counter\n+% values will be assigned in order of appearance in the output, not in\n+% order of appearance in the source code.  For example, the order of\n+% interspersed one- and two-column figures might get mixed up by\n+% \\LaTeX\\ in the output.  
Making the counter sorted will fix the order\n+% to match the order of appearance.  A similar problem is when\n+% ordinary footnotes are present in floating material (this does not\n+% work in standard "..b'@\n+    \\penalty \\ifnum\\count@<\\@M \\@M \\else \\count@ \\fi\n+  \\else \\kern\\dimen@\\fi\n+  \\pp@cl@end}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@labeldef}\n+%   This is a helper macro.\n+%    \\begin{macrocode}\n+\\def\\pp@labeldef#1#2#3#4#5{\\@newl@bel{pp@r@#2}{#3}{{#1}{#4}{#5}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\begin{macro}{\\pp@pagectr}\n+%   This is the workhorse for normal per page counters.  It is called\n+%   whenever the |.aux| file is read in and establishes the\n+%   appropriate information for each counter advancement in a\n+%   pseudolabel.\n+%    \\begin{macrocode}\n+\\def\\pp@pagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\addtocounter{pp@a@#1}\\@ne\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@schk@}\n+%   This is called for implementing sorted counters.  Sorted counters\n+%   maintain a ``count group\'\', and the values in each count group are\n+%   numbered independently from that of other count groups.  Whenever\n+%   a counter is found to have been reset, it will start a new count\n+%   group.  
At the end of document, the count group counters need to\n+%   get reset, too, so that the check for changed |.aux| files will\n+%   still work.\n+%    \\begin{macrocode}\n+\\def\\c@schk@#1{\\pp@cl@begin\n+  \\addtocounter{pp@a@#1}\\@ne\n+  \\ifnum\\value{#1}=\\@ne\n+    \\expandafter\\xdef\\csname pp@g@#1\\endcsname{\\number\\value{pp@a@#1}}%\n+    \\edef\\next{\\noexpand\\AtEndDocument{\\global\\let\n+      \\expandafter\\noexpand\\csname pp@g@#1@\\number\\value{pp@a@#1}\\endcsname\n+      \\relax}}\\next\n+  \\fi\n+  \\pp@fetchctr{#1}%\n+  \\ifx\\pp@page\\@empty\n+  \\else \\setcounter{#1}{\\pp@label}\\fi\n+  \\pp@writectr\\pp@spagectr{#1}{\\csname pp@g@#1\\endcsname}}%\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@spagectr}\n+%   This is the code advancing the respective value of the appropriate\n+%   count group and assigning the label.\n+%    \\begin{macrocode}\n+\\def\\pp@spagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\count@0\\csname pp@g@#1@#3\\endcsname\n+    \\advance\\count@\\@ne\n+    \\expandafter\\xdef\\csname pp@g@#1@#3\\endcsname{\\number\\count@}%\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\count@}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\c@spchk@}\n+%   And this finally is the counter advance code for sorted counters\n+%   per page.  
Basically, we just use one count group per page.\n+%   Resetting a counter manually will not introduce a new count group,\n+%   and it would be hard to decide what to do in case count groups and\n+%   page positions overlap.\n+%    \\begin{macrocode}\n+\\def\\c@spchk@#1{\\pp@cl@begin\n+  \\addtocounter{pp@a@#1}\\@ne\n+  \\pp@fetchctr{#1}%\n+  \\ifx\\pp@page\\@empty\n+  \\else \\setcounter{#1}{\\pp@label}\\fi\n+  \\pp@writectr\\pp@ppagectr{#1}{\\noexpand\\theabspage}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\pp@ppagectr}\n+%    \\begin{macrocode}\n+\\def\\pp@ppagectr#1#2#3#4{\\@ifundefined{c@pp@a@#1}{}{%\n+    \\def\\next{#3}%\n+    \\expandafter\\ifx\\csname pp@page@#1\\endcsname\\next\n+      \\addtocounter{pp@a@#1}\\@ne\n+    \\else\n+      \\setcounter{pp@a@#1}{\\value{pp@r@#1}}%\n+    \\fi\n+    \\global\\expandafter\\let\\csname pp@page@#1\\endcsname\\next\n+    \\expandafter\\pp@labeldef\\expandafter\n+      {\\number\\value{pp@a@#1}}{#1}{#2}{#3}{#4}}}\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \\begin{macro}{\\@testdef}\n+%   \\LaTeX\'s current (2007) definition of this macro causes save stack\n+%   overflow.  We fix this by an additional grouping.  Delay to the\n+%   beginning of document to keep Babel happy.\n+%   \\begin{macrocode}\n+\\AtBeginDocument{%\n+  \\begingroup\n+    \\@testdef{}{undefined}{}%\n+    \\expandafter\n+  \\endgroup\n+  \\ifx\\@undefined\\relax\n+    \\let\\pp@@testdef\\@testdef\n+    \\def\\@testdef#1#2#3{{\\pp@@testdef{#1}{#2}{#3}%\n+        \\if@tempswa\\aftergroup\\@tempswatrue\\fi}}%\n+  \\fi}\n+%</style>\n+%    \\end{macrocode}\n+% \\end{macro}\n+% \n+% \\Finale\n+% \\endinput\n+% Local Variables: \n+% mode: doctex\n+% TeX-master: "perpage.drv"\n+% End: \n'
b
diff -r dbff53e6f75f -r 08678c931f5d search_ppep.py
--- a/search_ppep.py Mon Jul 11 19:22:25 2022 +0000
+++ b/search_ppep.py Fri Oct 28 18:27:21 2022 +0000
[
@@ -237,7 +237,10 @@
 
     # Parse Command Line
     parser = argparse.ArgumentParser(
-        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
+        description=" ".join([
+            "Phopsphoproteomic Enrichment",
+            "phosphopeptide SwissProt search (in place in SQLite DB)."
+        ])
     )
 
     # inputs:
@@ -249,7 +252,11 @@
         nargs=1,
         required=True,
         dest="phosphopeptides",
-        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
+        help=" ".join([
+            "Phosphopeptide data for experimental results,",
+            "generated by the Phopsphoproteomic Enrichment Localization",
+            "Filter tool"
+        ]),
     )
     parser.add_argument(
         "--uniprotkb",
@@ -257,7 +264,10 @@
         nargs=1,
         required=True,
         dest="uniprotkb",
-        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
+        help=" ".join([
+            "UniProtKB/Swiss-Prot data, converted from FASTA format by the",
+            "Phopsphoproteomic Enrichment Kinase Mapping tool"
+        ]),
     )
     parser.add_argument(
         "--schema",
@@ -310,7 +320,8 @@
     cur.executescript(DROP_TABLES_SQL)
 
     # if options.db_schema:
-    #     print("\nAfter dropping tables/views that are to be created, schema is:")
+    #     print("\nAfter dropping tables/views that are to be created,"
+    #         + " schema is:")
     #     cur.execute("SELECT * FROM sqlite_schema")
     #     for row in cur.fetchall():
     #         if row[4] is not None:
@@ -403,7 +414,11 @@
         deppep_count = row[0]
 
     cur.execute(
-        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
+        """
+        SELECT count(*) FROM (
+          SELECT Sequence FROM UniProtKB GROUP BY Sequence
+          )
+        """
     )
     for row in cur.fetchall():
         sequence_count = row[0]
@@ -431,9 +446,11 @@
     old_seq = ""
     for row in cur.fetchall():
         if duplicate_count == 0:
-            print(
-                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
-            )
+            print(" ".join([
+                "\nEach of the following sequences is associated with several",
+                "accession IDs (which are listed in the first column) but",
+                "the same gene ID (which is listed in the second column)."
+            ]))
         if row[2] != old_seq:
             old_seq = row[2]
             duplicate_count += 1
@@ -480,13 +497,19 @@
                     )
             else:
                 raise ValueError(
-                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
-                    % (UniProtKB_id,)
+                    "UniProtKB_id %s, but Sequence is None: %s %s"
+                    % (
+                        UniProtKB_id,
+                        "Check whether SwissProt file is missing",
+                        "the sequence for this ID")
                 )
     ker.execute(
         """
-        SELECT   count(*) || ' accession-peptide-phosphopeptide combinations were found'
-        FROM     uniprotkb_pep_ppep_view
+        SELECT
+          count(*) ||
+            ' accession-peptide-phosphopeptide combinations were found'
+        FROM
+          uniprotkb_pep_ppep_view
         """
     )
     for row in ker.fetchall():
@@ -494,7 +517,9 @@
 
     ker.execute(
         """
-      SELECT   count(*) || ' accession matches were found', count(*) AS accession_count
+      SELECT
+        count(*) || ' accession matches were found',
+        count(*) AS accession_count
       FROM     (
         SELECT   accession
         FROM     uniprotkb_pep_ppep_view
@@ -520,7 +545,9 @@
 
     ker.execute(
         """
-      SELECT   count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
+      SELECT
+        count(*) || ' phosphopeptide matches were found',
+        count(*) AS phosphopeptide_count
       FROM     (
         SELECT   phosphopeptide
         FROM     uniprotkb_pep_ppep_view